alita-sdk 0.3.257__py3-none-any.whl → 0.3.562__py3-none-any.whl
This diff compares two publicly released versions of the package as they appear in their respective public registries. It is provided for informational purposes only.
- alita_sdk/cli/__init__.py +10 -0
- alita_sdk/cli/__main__.py +17 -0
- alita_sdk/cli/agent/__init__.py +5 -0
- alita_sdk/cli/agent/default.py +258 -0
- alita_sdk/cli/agent_executor.py +155 -0
- alita_sdk/cli/agent_loader.py +215 -0
- alita_sdk/cli/agent_ui.py +228 -0
- alita_sdk/cli/agents.py +3601 -0
- alita_sdk/cli/callbacks.py +647 -0
- alita_sdk/cli/cli.py +168 -0
- alita_sdk/cli/config.py +306 -0
- alita_sdk/cli/context/__init__.py +30 -0
- alita_sdk/cli/context/cleanup.py +198 -0
- alita_sdk/cli/context/manager.py +731 -0
- alita_sdk/cli/context/message.py +285 -0
- alita_sdk/cli/context/strategies.py +289 -0
- alita_sdk/cli/context/token_estimation.py +127 -0
- alita_sdk/cli/formatting.py +182 -0
- alita_sdk/cli/input_handler.py +419 -0
- alita_sdk/cli/inventory.py +1073 -0
- alita_sdk/cli/mcp_loader.py +315 -0
- alita_sdk/cli/toolkit.py +327 -0
- alita_sdk/cli/toolkit_loader.py +85 -0
- alita_sdk/cli/tools/__init__.py +43 -0
- alita_sdk/cli/tools/approval.py +224 -0
- alita_sdk/cli/tools/filesystem.py +1751 -0
- alita_sdk/cli/tools/planning.py +389 -0
- alita_sdk/cli/tools/terminal.py +414 -0
- alita_sdk/community/__init__.py +72 -12
- alita_sdk/community/inventory/__init__.py +236 -0
- alita_sdk/community/inventory/config.py +257 -0
- alita_sdk/community/inventory/enrichment.py +2137 -0
- alita_sdk/community/inventory/extractors.py +1469 -0
- alita_sdk/community/inventory/ingestion.py +3172 -0
- alita_sdk/community/inventory/knowledge_graph.py +1457 -0
- alita_sdk/community/inventory/parsers/__init__.py +218 -0
- alita_sdk/community/inventory/parsers/base.py +295 -0
- alita_sdk/community/inventory/parsers/csharp_parser.py +907 -0
- alita_sdk/community/inventory/parsers/go_parser.py +851 -0
- alita_sdk/community/inventory/parsers/html_parser.py +389 -0
- alita_sdk/community/inventory/parsers/java_parser.py +593 -0
- alita_sdk/community/inventory/parsers/javascript_parser.py +629 -0
- alita_sdk/community/inventory/parsers/kotlin_parser.py +768 -0
- alita_sdk/community/inventory/parsers/markdown_parser.py +362 -0
- alita_sdk/community/inventory/parsers/python_parser.py +604 -0
- alita_sdk/community/inventory/parsers/rust_parser.py +858 -0
- alita_sdk/community/inventory/parsers/swift_parser.py +832 -0
- alita_sdk/community/inventory/parsers/text_parser.py +322 -0
- alita_sdk/community/inventory/parsers/yaml_parser.py +370 -0
- alita_sdk/community/inventory/patterns/__init__.py +61 -0
- alita_sdk/community/inventory/patterns/ast_adapter.py +380 -0
- alita_sdk/community/inventory/patterns/loader.py +348 -0
- alita_sdk/community/inventory/patterns/registry.py +198 -0
- alita_sdk/community/inventory/presets.py +535 -0
- alita_sdk/community/inventory/retrieval.py +1403 -0
- alita_sdk/community/inventory/toolkit.py +173 -0
- alita_sdk/community/inventory/toolkit_utils.py +176 -0
- alita_sdk/community/inventory/visualize.py +1370 -0
- alita_sdk/configurations/__init__.py +11 -0
- alita_sdk/configurations/ado.py +148 -2
- alita_sdk/configurations/azure_search.py +1 -1
- alita_sdk/configurations/bigquery.py +1 -1
- alita_sdk/configurations/bitbucket.py +94 -2
- alita_sdk/configurations/browser.py +18 -0
- alita_sdk/configurations/carrier.py +19 -0
- alita_sdk/configurations/confluence.py +130 -1
- alita_sdk/configurations/delta_lake.py +1 -1
- alita_sdk/configurations/figma.py +76 -5
- alita_sdk/configurations/github.py +65 -1
- alita_sdk/configurations/gitlab.py +81 -0
- alita_sdk/configurations/google_places.py +17 -0
- alita_sdk/configurations/jira.py +103 -0
- alita_sdk/configurations/openapi.py +111 -0
- alita_sdk/configurations/postman.py +1 -1
- alita_sdk/configurations/qtest.py +72 -3
- alita_sdk/configurations/report_portal.py +115 -0
- alita_sdk/configurations/salesforce.py +19 -0
- alita_sdk/configurations/service_now.py +1 -12
- alita_sdk/configurations/sharepoint.py +167 -0
- alita_sdk/configurations/sonar.py +18 -0
- alita_sdk/configurations/sql.py +20 -0
- alita_sdk/configurations/testio.py +101 -0
- alita_sdk/configurations/testrail.py +88 -0
- alita_sdk/configurations/xray.py +94 -1
- alita_sdk/configurations/zephyr_enterprise.py +94 -1
- alita_sdk/configurations/zephyr_essential.py +95 -0
- alita_sdk/runtime/clients/artifact.py +21 -4
- alita_sdk/runtime/clients/client.py +458 -67
- alita_sdk/runtime/clients/mcp_discovery.py +342 -0
- alita_sdk/runtime/clients/mcp_manager.py +262 -0
- alita_sdk/runtime/clients/sandbox_client.py +352 -0
- alita_sdk/runtime/langchain/_constants_bkup.py +1318 -0
- alita_sdk/runtime/langchain/assistant.py +183 -43
- alita_sdk/runtime/langchain/constants.py +647 -1
- alita_sdk/runtime/langchain/document_loaders/AlitaDocxMammothLoader.py +315 -3
- alita_sdk/runtime/langchain/document_loaders/AlitaExcelLoader.py +209 -31
- alita_sdk/runtime/langchain/document_loaders/AlitaImageLoader.py +1 -1
- alita_sdk/runtime/langchain/document_loaders/AlitaJSONLinesLoader.py +77 -0
- alita_sdk/runtime/langchain/document_loaders/AlitaJSONLoader.py +10 -3
- alita_sdk/runtime/langchain/document_loaders/AlitaMarkdownLoader.py +66 -0
- alita_sdk/runtime/langchain/document_loaders/AlitaPDFLoader.py +79 -10
- alita_sdk/runtime/langchain/document_loaders/AlitaPowerPointLoader.py +52 -15
- alita_sdk/runtime/langchain/document_loaders/AlitaPythonLoader.py +9 -0
- alita_sdk/runtime/langchain/document_loaders/AlitaTableLoader.py +1 -4
- alita_sdk/runtime/langchain/document_loaders/AlitaTextLoader.py +15 -2
- alita_sdk/runtime/langchain/document_loaders/ImageParser.py +30 -0
- alita_sdk/runtime/langchain/document_loaders/constants.py +189 -41
- alita_sdk/runtime/langchain/interfaces/llm_processor.py +4 -2
- alita_sdk/runtime/langchain/langraph_agent.py +407 -92
- alita_sdk/runtime/langchain/utils.py +102 -8
- alita_sdk/runtime/llms/preloaded.py +2 -6
- alita_sdk/runtime/models/mcp_models.py +61 -0
- alita_sdk/runtime/skills/__init__.py +91 -0
- alita_sdk/runtime/skills/callbacks.py +498 -0
- alita_sdk/runtime/skills/discovery.py +540 -0
- alita_sdk/runtime/skills/executor.py +610 -0
- alita_sdk/runtime/skills/input_builder.py +371 -0
- alita_sdk/runtime/skills/models.py +330 -0
- alita_sdk/runtime/skills/registry.py +355 -0
- alita_sdk/runtime/skills/skill_runner.py +330 -0
- alita_sdk/runtime/toolkits/__init__.py +28 -0
- alita_sdk/runtime/toolkits/application.py +14 -4
- alita_sdk/runtime/toolkits/artifact.py +24 -9
- alita_sdk/runtime/toolkits/datasource.py +13 -6
- alita_sdk/runtime/toolkits/mcp.py +780 -0
- alita_sdk/runtime/toolkits/planning.py +178 -0
- alita_sdk/runtime/toolkits/skill_router.py +238 -0
- alita_sdk/runtime/toolkits/subgraph.py +11 -6
- alita_sdk/runtime/toolkits/tools.py +314 -70
- alita_sdk/runtime/toolkits/vectorstore.py +11 -5
- alita_sdk/runtime/tools/__init__.py +24 -0
- alita_sdk/runtime/tools/application.py +16 -4
- alita_sdk/runtime/tools/artifact.py +367 -33
- alita_sdk/runtime/tools/data_analysis.py +183 -0
- alita_sdk/runtime/tools/function.py +100 -4
- alita_sdk/runtime/tools/graph.py +81 -0
- alita_sdk/runtime/tools/image_generation.py +218 -0
- alita_sdk/runtime/tools/llm.py +1013 -177
- alita_sdk/runtime/tools/loop.py +3 -1
- alita_sdk/runtime/tools/loop_output.py +3 -1
- alita_sdk/runtime/tools/mcp_inspect_tool.py +284 -0
- alita_sdk/runtime/tools/mcp_remote_tool.py +181 -0
- alita_sdk/runtime/tools/mcp_server_tool.py +3 -1
- alita_sdk/runtime/tools/planning/__init__.py +36 -0
- alita_sdk/runtime/tools/planning/models.py +246 -0
- alita_sdk/runtime/tools/planning/wrapper.py +607 -0
- alita_sdk/runtime/tools/router.py +2 -1
- alita_sdk/runtime/tools/sandbox.py +375 -0
- alita_sdk/runtime/tools/skill_router.py +776 -0
- alita_sdk/runtime/tools/tool.py +3 -1
- alita_sdk/runtime/tools/vectorstore.py +69 -65
- alita_sdk/runtime/tools/vectorstore_base.py +163 -90
- alita_sdk/runtime/utils/AlitaCallback.py +137 -21
- alita_sdk/runtime/utils/mcp_client.py +492 -0
- alita_sdk/runtime/utils/mcp_oauth.py +361 -0
- alita_sdk/runtime/utils/mcp_sse_client.py +434 -0
- alita_sdk/runtime/utils/mcp_tools_discovery.py +124 -0
- alita_sdk/runtime/utils/streamlit.py +41 -14
- alita_sdk/runtime/utils/toolkit_utils.py +28 -9
- alita_sdk/runtime/utils/utils.py +48 -0
- alita_sdk/tools/__init__.py +135 -37
- alita_sdk/tools/ado/__init__.py +2 -2
- alita_sdk/tools/ado/repos/__init__.py +15 -19
- alita_sdk/tools/ado/repos/repos_wrapper.py +12 -20
- alita_sdk/tools/ado/test_plan/__init__.py +26 -8
- alita_sdk/tools/ado/test_plan/test_plan_wrapper.py +56 -28
- alita_sdk/tools/ado/wiki/__init__.py +27 -12
- alita_sdk/tools/ado/wiki/ado_wrapper.py +114 -40
- alita_sdk/tools/ado/work_item/__init__.py +27 -12
- alita_sdk/tools/ado/work_item/ado_wrapper.py +95 -11
- alita_sdk/tools/advanced_jira_mining/__init__.py +12 -8
- alita_sdk/tools/aws/delta_lake/__init__.py +14 -11
- alita_sdk/tools/aws/delta_lake/tool.py +5 -1
- alita_sdk/tools/azure_ai/search/__init__.py +13 -8
- alita_sdk/tools/base/tool.py +5 -1
- alita_sdk/tools/base_indexer_toolkit.py +454 -110
- alita_sdk/tools/bitbucket/__init__.py +27 -19
- alita_sdk/tools/bitbucket/api_wrapper.py +285 -27
- alita_sdk/tools/bitbucket/cloud_api_wrapper.py +5 -5
- alita_sdk/tools/browser/__init__.py +41 -16
- alita_sdk/tools/browser/crawler.py +3 -1
- alita_sdk/tools/browser/utils.py +15 -6
- alita_sdk/tools/carrier/__init__.py +18 -17
- alita_sdk/tools/carrier/backend_reports_tool.py +8 -4
- alita_sdk/tools/carrier/excel_reporter.py +8 -4
- alita_sdk/tools/chunkers/__init__.py +3 -1
- alita_sdk/tools/chunkers/code/codeparser.py +1 -1
- alita_sdk/tools/chunkers/sematic/json_chunker.py +2 -1
- alita_sdk/tools/chunkers/sematic/markdown_chunker.py +97 -6
- alita_sdk/tools/chunkers/sematic/proposal_chunker.py +1 -1
- alita_sdk/tools/chunkers/universal_chunker.py +270 -0
- alita_sdk/tools/cloud/aws/__init__.py +11 -7
- alita_sdk/tools/cloud/azure/__init__.py +11 -7
- alita_sdk/tools/cloud/gcp/__init__.py +11 -7
- alita_sdk/tools/cloud/k8s/__init__.py +11 -7
- alita_sdk/tools/code/linter/__init__.py +9 -8
- alita_sdk/tools/code/loaders/codesearcher.py +3 -2
- alita_sdk/tools/code/sonar/__init__.py +20 -13
- alita_sdk/tools/code_indexer_toolkit.py +199 -0
- alita_sdk/tools/confluence/__init__.py +21 -14
- alita_sdk/tools/confluence/api_wrapper.py +197 -58
- alita_sdk/tools/confluence/loader.py +14 -2
- alita_sdk/tools/custom_open_api/__init__.py +11 -5
- alita_sdk/tools/elastic/__init__.py +10 -8
- alita_sdk/tools/elitea_base.py +546 -64
- alita_sdk/tools/figma/__init__.py +11 -8
- alita_sdk/tools/figma/api_wrapper.py +352 -153
- alita_sdk/tools/github/__init__.py +17 -17
- alita_sdk/tools/github/api_wrapper.py +9 -26
- alita_sdk/tools/github/github_client.py +81 -12
- alita_sdk/tools/github/schemas.py +2 -1
- alita_sdk/tools/github/tool.py +5 -1
- alita_sdk/tools/gitlab/__init__.py +18 -13
- alita_sdk/tools/gitlab/api_wrapper.py +224 -80
- alita_sdk/tools/gitlab_org/__init__.py +13 -10
- alita_sdk/tools/google/bigquery/__init__.py +13 -13
- alita_sdk/tools/google/bigquery/tool.py +5 -1
- alita_sdk/tools/google_places/__init__.py +20 -11
- alita_sdk/tools/jira/__init__.py +21 -11
- alita_sdk/tools/jira/api_wrapper.py +315 -168
- alita_sdk/tools/keycloak/__init__.py +10 -8
- alita_sdk/tools/localgit/__init__.py +8 -3
- alita_sdk/tools/localgit/local_git.py +62 -54
- alita_sdk/tools/localgit/tool.py +5 -1
- alita_sdk/tools/memory/__init__.py +38 -14
- alita_sdk/tools/non_code_indexer_toolkit.py +7 -2
- alita_sdk/tools/ocr/__init__.py +10 -8
- alita_sdk/tools/openapi/__init__.py +281 -108
- alita_sdk/tools/openapi/api_wrapper.py +883 -0
- alita_sdk/tools/openapi/tool.py +20 -0
- alita_sdk/tools/pandas/__init__.py +18 -11
- alita_sdk/tools/pandas/api_wrapper.py +40 -45
- alita_sdk/tools/pandas/dataframe/generator/base.py +3 -1
- alita_sdk/tools/postman/__init__.py +10 -11
- alita_sdk/tools/postman/api_wrapper.py +19 -8
- alita_sdk/tools/postman/postman_analysis.py +8 -1
- alita_sdk/tools/pptx/__init__.py +10 -10
- alita_sdk/tools/qtest/__init__.py +21 -14
- alita_sdk/tools/qtest/api_wrapper.py +1784 -88
- alita_sdk/tools/rally/__init__.py +12 -10
- alita_sdk/tools/report_portal/__init__.py +22 -16
- alita_sdk/tools/salesforce/__init__.py +21 -16
- alita_sdk/tools/servicenow/__init__.py +20 -16
- alita_sdk/tools/servicenow/api_wrapper.py +1 -1
- alita_sdk/tools/sharepoint/__init__.py +16 -14
- alita_sdk/tools/sharepoint/api_wrapper.py +179 -39
- alita_sdk/tools/sharepoint/authorization_helper.py +191 -1
- alita_sdk/tools/sharepoint/utils.py +8 -2
- alita_sdk/tools/slack/__init__.py +11 -7
- alita_sdk/tools/sql/__init__.py +21 -19
- alita_sdk/tools/sql/api_wrapper.py +71 -23
- alita_sdk/tools/testio/__init__.py +20 -13
- alita_sdk/tools/testrail/__init__.py +12 -11
- alita_sdk/tools/testrail/api_wrapper.py +214 -46
- alita_sdk/tools/utils/__init__.py +28 -4
- alita_sdk/tools/utils/content_parser.py +182 -62
- alita_sdk/tools/utils/text_operations.py +254 -0
- alita_sdk/tools/vector_adapters/VectorStoreAdapter.py +83 -27
- alita_sdk/tools/xray/__init__.py +17 -14
- alita_sdk/tools/xray/api_wrapper.py +58 -113
- alita_sdk/tools/yagmail/__init__.py +8 -3
- alita_sdk/tools/zephyr/__init__.py +11 -7
- alita_sdk/tools/zephyr_enterprise/__init__.py +15 -9
- alita_sdk/tools/zephyr_enterprise/api_wrapper.py +30 -15
- alita_sdk/tools/zephyr_essential/__init__.py +15 -10
- alita_sdk/tools/zephyr_essential/api_wrapper.py +297 -54
- alita_sdk/tools/zephyr_essential/client.py +6 -4
- alita_sdk/tools/zephyr_scale/__init__.py +12 -8
- alita_sdk/tools/zephyr_scale/api_wrapper.py +39 -31
- alita_sdk/tools/zephyr_squad/__init__.py +11 -7
- {alita_sdk-0.3.257.dist-info → alita_sdk-0.3.562.dist-info}/METADATA +184 -37
- alita_sdk-0.3.562.dist-info/RECORD +450 -0
- alita_sdk-0.3.562.dist-info/entry_points.txt +2 -0
- alita_sdk/tools/bitbucket/tools.py +0 -304
- alita_sdk-0.3.257.dist-info/RECORD +0 -343
- {alita_sdk-0.3.257.dist-info → alita_sdk-0.3.562.dist-info}/WHEEL +0 -0
- {alita_sdk-0.3.257.dist-info → alita_sdk-0.3.562.dist-info}/licenses/LICENSE +0 -0
- {alita_sdk-0.3.257.dist-info → alita_sdk-0.3.562.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,1469 @@
+"""
+LLM-based extractors for document classification, schema discovery,
+entity extraction, and relation extraction.
+
+Supports comprehensive entity types across multiple layers:
+- Product Layer: Features, Epics, User Stories, Screens, UX Flows
+- Domain Layer: Business Objects, Rules, Glossary Terms
+- Service Layer: APIs, Endpoints, Services, Methods
+- Code Layer: Modules, Classes, Functions
+- Data Layer: Tables, Columns, Constraints
+- Testing Layer: Test Cases, Test Suites, Defects
+- Delivery Layer: Releases, Commits, Tickets
+- Organization Layer: Teams, Owners, Repositories
+"""
+
+import json
+import logging
+import hashlib
+from typing import Any, Optional, List, Dict, Union, Tuple
+
+from langchain_core.documents import Document
+from langchain_core.prompts import ChatPromptTemplate
+from langchain_core.output_parsers import JsonOutputParser
+
+logger = logging.getLogger(__name__)
+
+
+# ============================================================================
+# COMPREHENSIVE ENTITY & RELATIONSHIP TAXONOMY
+# ============================================================================
+
+ENTITY_TAXONOMY = {
+    "product_layer": {
+        "description": "Product and UX artifacts",
+        "types": [
+            {"name": "epic", "description": "Large feature grouping or initiative", "properties": ["name", "description", "acceptance_criteria", "priority"]},
+            {"name": "feature", "description": "Product capability or functionality", "properties": ["name", "description", "acceptance_criteria", "related_screens"]},
+            {"name": "user_story", "description": "User requirement in story format", "properties": ["name", "description", "persona", "acceptance_criteria", "story_points"]},
+            {"name": "screen", "description": "UI page, screen, or view", "properties": ["name", "description", "url_path", "parent_screen"]},
+            {"name": "ux_flow", "description": "User journey or navigation flow", "properties": ["name", "description", "start_screen", "end_screen", "steps"]},
+            {"name": "ui_component", "description": "Reusable UI element (form, button, modal)", "properties": ["name", "description", "component_type", "parent_screen"]},
+            {"name": "ui_field", "description": "Input field, dropdown, or display element", "properties": ["name", "field_type", "validation_rules", "api_mapping", "db_mapping"]},
+        ]
+    },
+    "domain_layer": {
+        "description": "Business domain concepts",
+        "types": [
+            {"name": "domain_entity", "description": "Core business object (Customer, Order, Product)", "properties": ["name", "description", "attributes", "lifecycle_states"]},
+            {"name": "attribute", "description": "Property of a domain entity", "properties": ["name", "data_type", "constraints", "parent_entity"]},
+            {"name": "business_rule", "description": "Business logic or constraint", "properties": ["name", "description", "trigger_event", "conditions", "actions", "exceptions"]},
+            {"name": "business_event", "description": "Domain event that triggers actions", "properties": ["name", "description", "trigger", "payload", "handlers"]},
+            {"name": "glossary_term", "description": "Domain vocabulary definition", "properties": ["name", "definition", "synonyms", "related_terms"]},
+            {"name": "workflow", "description": "Business process or workflow", "properties": ["name", "description", "steps", "triggers", "outcomes"]},
+        ]
+    },
+    "service_layer": {
+        "description": "APIs and services (semantic descriptions, not code structure)",
+        "types": [
+            {"name": "service", "description": "Software service or microservice", "properties": ["name", "description", "tech_stack", "owner_team"]},
+
+            # REST API types (from specs/docs, not code)
+            {"name": "rest_api", "description": "REST API specification", "properties": ["name", "description", "version", "auth_schema", "base_url", "content_type"]},
+            {"name": "rest_endpoint", "description": "REST API endpoint", "properties": ["name", "method", "path", "request_schema", "response_schema", "auth_required", "status_codes"]},
+
+            # GraphQL types (from specs/docs)
+            {"name": "graphql_api", "description": "GraphQL API schema", "properties": ["name", "description", "version", "endpoint", "auth_schema"]},
+            {"name": "graphql_query", "description": "GraphQL query operation", "properties": ["name", "description", "arguments", "return_type"]},
+            {"name": "graphql_mutation", "description": "GraphQL mutation operation", "properties": ["name", "description", "arguments", "return_type"]},
+
+            # Event-driven types (semantic)
+            {"name": "event_type", "description": "Event type or message schema", "properties": ["name", "description", "version", "schema", "payload_fields"]},
+
+            # Integration types
+            {"name": "integration", "description": "External system integration", "properties": ["name", "description", "protocol", "external_system", "direction"]},
+        ]
+    },
+    # NOTE: code_layer removed - classes, functions, methods, modules, interfaces, constants
+    # are now extracted by AST/regex parsers, not LLM entity extraction
+    "data_layer": {
+        "description": "Database and data artifacts (from docs/specs, not code)",
+        "types": [
+            {"name": "database", "description": "Database or data store", "properties": ["name", "type", "description"]},
+            {"name": "table", "description": "Database table or collection", "properties": ["name", "description", "primary_key", "indexes"]},
+            {"name": "column", "description": "Table column or field", "properties": ["name", "data_type", "nullable", "default_value", "constraints", "parent_table"]},
+            {"name": "migration", "description": "Database migration script", "properties": ["name", "version", "description", "changes"]},
+            {"name": "enum", "description": "Enumeration or lookup values", "properties": ["name", "values", "description"]},
+        ]
+    },
+    "testing_layer": {
+        "description": "Testing artifacts",
+        "types": [
+            {"name": "test_suite", "description": "Collection of related test cases", "properties": ["name", "description", "test_type", "coverage_area"]},
+            {"name": "test_case", "description": "Individual test case", "properties": ["name", "description", "preconditions", "steps", "expected_result", "priority", "automated"]},
+            {"name": "test_data", "description": "Test data set or fixture", "properties": ["name", "description", "data_format", "scope"]},
+            {"name": "defect", "description": "Bug or defect report", "properties": ["name", "description", "severity", "status", "steps_to_reproduce", "affected_version"]},
+            {"name": "incident", "description": "Production incident", "properties": ["name", "description", "severity", "impact", "root_cause", "resolution"]},
+        ]
+    },
+    "delivery_layer": {
+        "description": "Delivery and release artifacts",
+        "types": [
+            {"name": "release", "description": "Software release or version", "properties": ["name", "version", "release_date", "changes", "status"]},
+            {"name": "sprint", "description": "Development sprint or iteration", "properties": ["name", "start_date", "end_date", "goals"]},
+            {"name": "ticket", "description": "Work item or task ticket", "properties": ["name", "description", "type", "status", "priority", "assignee"]},
+            {"name": "deployment", "description": "Deployment to environment", "properties": ["name", "environment", "version", "timestamp", "status"]},
+        ]
+    },
+    "organization_layer": {
+        "description": "People and organizational artifacts",
+        "types": [
+            {"name": "team", "description": "Development team or squad", "properties": ["name", "description", "members", "responsibilities"]},
+            {"name": "owner", "description": "Feature or component owner", "properties": ["name", "email", "role", "owned_components"]},
+            {"name": "stakeholder", "description": "Business stakeholder", "properties": ["name", "role", "interests", "contact"]},
+            {"name": "repository", "description": "Code repository", "properties": ["name", "url", "description", "language", "owner_team"]},
+            {"name": "documentation", "description": "Documentation page or article", "properties": ["name", "url", "description", "doc_type", "last_updated"]},
+        ]
+    },
+    "tooling_layer": {
+        "description": "Tools and integration toolkits",
+        "types": [
+            {"name": "toolkit", "description": "Integration toolkit or connector (e.g., Jira Toolkit, GitHub Toolkit)", "properties": ["name", "description", "tools", "configuration_fields", "authentication"]},
+            {"name": "tool", "description": "Individual tool or capability within a toolkit", "properties": ["name", "description", "parameters", "return_type", "parent_toolkit"]},
+            {"name": "mcp_server", "description": "MCP (Model Context Protocol) server", "properties": ["name", "description", "transport", "tools", "resources"]},
+            {"name": "mcp_tool", "description": "Tool exposed by an MCP server", "properties": ["name", "description", "input_schema", "parent_server"]},
+            {"name": "connector", "description": "External system connector or adapter", "properties": ["name", "description", "target_system", "auth_type", "capabilities"]},
+        ]
+    },
+}
+
+RELATIONSHIP_TAXONOMY = {
+    # NOTE: Code structural relationships (imports, extends, implements, calls, contains for code)
+    # are now extracted by AST/regex parsers, not LLM. This taxonomy is for semantic relationships.
+    "structural": {
+        "description": "Structural relationships (for non-code entities)",
+        "types": [
+            {"name": "contains", "description": "Parent contains child (non-code)", "examples": ["screen contains ui_component", "toolkit contains tool", "epic contains feature"]},
+            {"name": "part_of", "description": "Part of larger whole", "examples": ["column part_of table", "tool part_of toolkit", "ui_field part_of screen"]},
+            {"name": "provides", "description": "Provides capability or resource", "examples": ["toolkit provides tool", "mcp_server provides mcp_tool", "service provides api"]},
+        ]
+    },
+    "behavioral": {
+        "description": "Behavioral and runtime relationships (semantic, not code-level)",
+        "types": [
+            {"name": "triggers", "description": "Triggers event or action", "examples": ["business_rule triggers workflow", "event triggers handler"]},
+            {"name": "depends_on", "description": "Business/feature dependency", "examples": ["service depends_on service", "feature depends_on feature"]},
+            {"name": "uses", "description": "Uses or references", "examples": ["feature uses service", "test_case uses test_data"]},
+            {"name": "publishes", "description": "Publishes event", "examples": ["service publishes event_type"]},
+            {"name": "subscribes_to", "description": "Subscribes to event", "examples": ["service subscribes_to event_type"]},
+        ]
+    },
+    "data_lineage": {
+        "description": "Data flow relationships",
+        "types": [
+            {"name": "stores_in", "description": "Data stored in", "examples": ["ui_field stores_in column", "endpoint stores_in table"]},
+            {"name": "reads_from", "description": "Reads data from", "examples": ["endpoint reads_from table", "screen reads_from api"]},
+            {"name": "maps_to", "description": "Data mapping", "examples": ["ui_field maps_to column", "attribute maps_to column"]},
+        ]
+    },
+    "ui_product": {
+        "description": "UI and product relationships",
+        "types": [
+            {"name": "shown_on", "description": "Displayed on screen/UI", "examples": ["ui_field shown_on screen", "domain_entity shown_on screen"]},
+            {"name": "navigates_to", "description": "Navigation link", "examples": ["screen navigates_to screen", "button navigates_to screen"]},
+            {"name": "validates", "description": "Validates input", "examples": ["business_rule validates ui_field"]},
+        ]
+    },
+    "testing": {
+        "description": "Testing relationships",
+        "types": [
+            {"name": "tests", "description": "Tests functionality", "examples": ["test_case tests feature", "test_case tests endpoint"]},
+            {"name": "covers", "description": "Test coverage", "examples": ["test_suite covers feature", "test_case covers user_story"]},
+            {"name": "reproduces", "description": "Reproduces defect", "examples": ["test_case reproduces defect"]},
+        ]
+    },
+    "ownership": {
+        "description": "Ownership and responsibility",
+        "types": [
+            {"name": "owned_by", "description": "Owned by team/person", "examples": ["service owned_by team", "feature owned_by owner"]},
+            {"name": "maintained_by", "description": "Maintained by", "examples": ["repository maintained_by team"]},
+            {"name": "assigned_to", "description": "Assigned to person", "examples": ["ticket assigned_to owner", "defect assigned_to owner"]},
+        ]
+    },
+    "temporal": {
+        "description": "Temporal and versioning relationships",
+        "types": [
+            {"name": "introduced_in", "description": "Introduced in release", "examples": ["feature introduced_in release", "api introduced_in release"]},
+            {"name": "modified_in", "description": "Modified in release", "examples": ["table modified_in migration"]},
+            {"name": "blocks", "description": "Blocks progress", "examples": ["defect blocks feature", "ticket blocks ticket"]},
+        ]
+    },
+    "semantic": {
+        "description": "Semantic and knowledge relationships",
+        "types": [
+            {"name": "related_to", "description": "General relationship", "examples": ["feature related_to feature", "ticket related_to defect"]},
+            {"name": "duplicates", "description": "Duplicate of another", "examples": ["defect duplicates defect"]},
+            {"name": "references", "description": "References document", "examples": ["ticket references documentation", "test_case references user_story"]},
+            {"name": "documents", "description": "Documents or describes", "examples": ["documentation documents feature", "wiki documents api"]},
+        ]
+    },
+}
+
+
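Both taxonomies are plain dictionaries, so downstream code can flatten them into prompt text. A minimal sketch of that idea, assuming only the dict shapes shown above (the helper name is illustrative and not part of the package):

```python
# Illustrative helper (not in the package): flatten a taxonomy into the kind
# of bulleted schema text the extraction prompts interpolate.
def taxonomy_to_bullets(taxonomy: dict) -> str:
    lines = []
    for layer, spec in taxonomy.items():
        lines.append(f"## {layer}: {spec['description']}")
        for etype in spec["types"]:
            # Entity entries carry "properties"; relationship entries carry "examples".
            extra = etype.get("properties") or etype.get("examples") or []
            lines.append(f"- {etype['name']}: {etype['description']} ({', '.join(extra)})")
    return "\n".join(lines)

print(taxonomy_to_bullets(ENTITY_TAXONOMY))
```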
+# ============================================================================
+# PROMPTS
+# ============================================================================
+
+DOCUMENT_CLASSIFIER_PROMPT = """Analyze the following document chunk and classify it into one of these document types:
+- code: Source code files (Python, JavaScript, Java, etc.)
+- api_spec: API specifications (OpenAPI, Swagger, GraphQL schemas)
+- requirements: Requirements documents, user stories, specs
+- architecture: Architecture documentation, design documents
+- config: Configuration files (YAML, JSON config, env files)
+- database: Database schemas, migrations, SQL
+- test: Test files, test cases
+- documentation: General documentation, READMEs, guides
+- ticket: Issue tickets, bug reports, feature requests (Jira, GitHub issues)
+- commit: Git commits, changelogs
+- ui: UI component definitions, screen layouts, UX flows
+- other: Anything that doesn't fit the above
+
+Document content:
+---
+{content}
+---
+
+Metadata:
+{metadata}
+
+Respond with ONLY a JSON object:
+{{"doc_type": "<type>", "confidence": <0.0-1.0>}}
+"""
+
+
+SCHEMA_DISCOVERY_PROMPT = """Analyze the following document samples to discover entity types and relationship types for a comprehensive knowledge graph.
+
+## Entity Layers to Consider
+
+### Product Layer (UI/UX artifacts)
+- Epic, Feature, User Story (product requirements)
+- Screen, Page, View (UI containers)
+- UX Flow, Journey (navigation flows)
+- UI Component, Field (interactive elements with validation rules)
+
+### Domain Layer (Business concepts)
+- Domain Entity (Customer, Order, Product - core business objects)
+- Attribute (properties of domain entities)
+- Business Rule (conditions, triggers, exceptions)
+- Business Event (domain events that trigger actions)
+- Glossary Term (vocabulary definitions, synonyms)
+
+### Service Layer (APIs and integrations)
+- Service, Microservice
+- API, Endpoint (with method, path, auth)
+- Payload, Schema (request/response structures)
+- Integration (external system connections)
+
+### Code Layer (Implementation)
+- Module, Package
+- Class, Interface
+- Function, Method
+- Configuration, Constant
+
+### Data Layer (Storage)
+- Database, Table, Collection
+- Column, Field (with type, constraints, nullable)
+- Constraint, Index, Enum
+- Migration Script
+
+### Testing Layer
+- Test Suite, Test Case (with preconditions, steps, expected results)
+- Test Data, Fixture
+- Defect, Bug (with severity, reproduction steps)
+- Incident (production issues)
+
+### Delivery Layer
+- Release, Version
+- Sprint, Iteration
+- Commit, Pull Request
+- Ticket, Task
+
+### Organization Layer
+- Team, Squad (ownership)
+- Owner, SME (subject matter experts)
+- Repository (code location)
+- Documentation (wiki, guides)
+
+## Relationship Categories
+
+- Structural: contains, extends, implements, imports, part_of
+- Behavioral: calls, triggers, depends_on, uses
+- Data Lineage: stores_in, reads_from, maps_to, transforms
+- UI/Product: shown_on, navigates_to, validates
+- Testing: tests, validates, covers, reproduces
+- Ownership: owned_by, maintained_by, assigned_to, reviewed_by
+- Temporal: introduced_in, removed_in, modified_in, supersedes, blocks
+- Semantic: related_to, duplicates, contradicts, references, synonym_of
+
+---
+
+Document samples:
+---
+{samples}
+---
+
+Based on these samples, identify which entity types and relationships are most relevant.
+Group by layer and include properties that would be valuable to extract.
+
+Respond with ONLY a JSON object:
+{{
+  "entity_types": [
+    {{"name": "<type_name>", "layer": "<layer_name>", "description": "<description>", "properties": ["<prop1>", "<prop2>"]}}
+  ],
+  "relation_types": [
+    {{"name": "<relation_name>", "category": "<category>", "description": "<description>", "source_types": ["<entity_type>"], "target_types": ["<entity_type>"]}}
+  ]
+}}
+"""
+
+
+ENTITY_EXTRACTION_PROMPT = """Extract semantic entities from the following document for a knowledge graph.
+
+NOTE: Code structure entities (classes, functions, methods, modules, interfaces, constants, variables, imports)
+are automatically extracted by AST/regex parsers - DO NOT extract these from code files.
+Focus on extracting SEMANTIC entities that represent business concepts, requirements, and domain knowledge.
+
+{schema_section}
+
+Document content (with line numbers):
+---
+{content}
+---
+
+Source file: {file_path}
+Source toolkit: {source_toolkit}
+
+## What to Extract:
+For CODE files (.py, .js, .java, etc.): Extract only semantic entities like:
+- Features, requirements, business rules mentioned in comments/docstrings
+- Domain concepts, glossary terms explained in documentation strings
+- TODOs, FIXMEs, or technical debt notes
+- API contracts or integration points described in comments
+- Test scenarios or acceptance criteria in docstrings
+
+For DOCUMENTATION files (.md, .rst, .txt, confluence, etc.): Extract all entities including:
+- Features, requirements, user stories, epics
+- Domain entities, business rules, glossary terms
+- Workflows, processes, procedures
+- Services, APIs, integrations (as described, not as code)
+- Test cases, test suites, defects
+- Teams, owners, stakeholders
+
+For each entity provide:
+- A unique ID (use existing identifiers when available)
+- The entity type (from semantic types above, NOT code types like class/function)
+- The line range where this entity is defined or described (at least 3-5 lines minimum)
+- Properties including at minimum: name, description
+
+IMPORTANT: line_start and line_end should capture the full context of the entity definition.
+Single-line references (line_start == line_end) are discouraged - expand to include surrounding context.
+
+Respond with ONLY a JSON array:
+[
+  {{
+    "id": "<unique_id>",
+    "type": "<entity_type>",
+    "name": "<entity_name>",
+    "line_start": <start_line_number>,
+    "line_end": <end_line_number>,
+    "properties": {{
+      "description": "<brief_description>",
+      ...
+    }}
+  }}
+]
+"""
+
+
+RELATION_EXTRACTION_PROMPT = """Extract SEMANTIC relationships between the entities listed below based on the document content.
+
+NOTE: Structural code relationships (imports, extends, implements, calls, contains for code elements)
+are automatically extracted by AST/regex parsers - DO NOT extract these from code files.
+Focus on extracting SEMANTIC relationships that represent business logic and domain connections.
+
+## Document content:
+---
+{content}
+---
+
+## Available Entities (ID -> Name):
+{entities_list}
+
+{schema_section}
+
+## Instructions:
+1. Look for semantic relationships mentioned or implied in the document
+2. For source_id and target_id, you MUST use EXACTLY the ID shown before the arrow (->)
+
+## Relationship Types to Extract:
+For CODE files: Focus on semantic relationships like:
+- tests (test_case tests feature)
+- validates (test validates business_rule)
+- documents (code documents requirement)
+- related_to (feature related_to feature)
+- depends_on (business dependency, not code import)
+
+For DOCUMENTATION files: Extract all relationship types:
+- tests, validates, covers (testing relationships)
+- owned_by, maintained_by, assigned_to (ownership)
+- introduced_in, modified_in, removed_in (temporal)
+- related_to, references, duplicates (semantic)
+- navigates_to, shown_on (UI relationships)
+- triggers, depends_on (behavioral - for business logic)
+
+DO NOT extract for code files:
+- imports (handled by parser)
+- extends/implements (handled by parser)
+- calls (handled by parser)
+- contains (for code structure - handled by parser)
+
+## Output Format:
+Respond with ONLY a JSON array. Use the EXACT entity IDs from the list above:
+[
+  {{
+    "source_id": "<exact-id-from-list>",
+    "relation_type": "<relationship_type>",
+    "target_id": "<exact-id-from-list>",
+    "confidence": <0.0-1.0>
+  }}
]
+
+If no relationships are found, return an empty array: []
+
+EXAMPLE: If entities are "Migration Guide (407b9c0c2048)" and "Before State (bc4612fc3d87)",
+a valid relation would be: {{"source_id": "407b9c0c2048", "relation_type": "describes", "target_id": "bc4612fc3d87", "confidence": 0.9}}
+"""
+
+
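Note that every literal `{` and `}` in the JSON examples above is doubled (`{{`, `}}`): `ChatPromptTemplate.from_template` treats single braces as input variables, so only placeholders such as `{content}`, `{metadata}`, `{samples}`, `{schema_section}`, `{entities_list}`, `{file_path}`, and `{source_toolkit}` are substituted at invoke time. A quick illustration:

```python
from langchain_core.prompts import ChatPromptTemplate

# Single braces are template variables; doubled braces render as literal braces.
tmpl = ChatPromptTemplate.from_template('Respond with {{"doc_type": "<type>"}} for: {content}')
msg = tmpl.format_messages(content="example text")[0]
print(msg.content)
# -> Respond with {"doc_type": "<type>"} for: example text
```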
+class DocumentClassifier:
+    """Classifies documents by type using LLM."""
+
+    def __init__(self, llm: Any):
+        self.llm = llm
+        self.prompt = ChatPromptTemplate.from_template(DOCUMENT_CLASSIFIER_PROMPT)
+        self.parser = JsonOutputParser()
+
+    def classify(self, document: Document) -> str:
+        """Classify a single document."""
+        try:
+            content = document.page_content[:3000]  # Limit content size
+            metadata = json.dumps(document.metadata, default=str)[:500]
+
+            chain = self.prompt | self.llm | self.parser
+            result = chain.invoke({
+                "content": content,
+                "metadata": metadata
+            })
+
+            return result.get('doc_type', 'other')
+        except Exception as e:
+            logger.warning(f"Classification failed: {e}")
+            return 'other'
+
+    def classify_batch(self, documents: List[Document]) -> List[str]:
+        """Classify multiple documents."""
+        return [self.classify(doc) for doc in documents]
+
+
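A minimal usage sketch for `DocumentClassifier`. Any LangChain chat model works; the `ChatOpenAI` model name and the sample document below are illustrative assumptions, not part of the diff:

```python
from langchain_core.documents import Document
from langchain_openai import ChatOpenAI  # any LangChain chat model would do

llm = ChatOpenAI(model="gpt-4o-mini", temperature=0)  # illustrative choice
classifier = DocumentClassifier(llm)

doc = Document(
    page_content="def add(a, b):\n    return a + b",
    metadata={"source": "math_utils.py"},
)
print(classifier.classify(doc))  # likely "code"; falls back to "other" on any error
```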
+class EntitySchemaDiscoverer:
+    """Discovers entity and relation schemas from document samples using LLM."""
+
+    def __init__(self, llm: Any):
+        self.llm = llm
+        self.prompt = ChatPromptTemplate.from_template(SCHEMA_DISCOVERY_PROMPT)
+        self.parser = JsonOutputParser()
+
+    def discover(self, documents: List[Document]) -> Dict[str, Any]:
+        """
+        Discover entity and relation types from document samples.
+
+        Args:
+            documents: Sample documents to analyze
+
+        Returns:
+            Schema dictionary with entity_types and relation_types
+        """
+        try:
+            # Build samples string
+            samples_parts = []
+            for i, doc in enumerate(documents[:20]):  # Limit samples
+                content = doc.page_content[:500]
+                doc_type = doc.metadata.get('doc_type', 'unknown')
+                source = doc.metadata.get('source_toolkit', 'unknown')
+                samples_parts.append(f"[Sample {i+1} - {doc_type} from {source}]\n{content}\n")
+
+            samples = "\n---\n".join(samples_parts)
+
+            chain = self.prompt | self.llm | self.parser
+            result = chain.invoke({"samples": samples})
+
+            # Validate structure
+            if 'entity_types' not in result:
+                result['entity_types'] = []
+            if 'relation_types' not in result:
+                result['relation_types'] = []
+
+            return result
+        except Exception as e:
+            logger.error(f"Schema discovery failed: {e}")
+            return self._default_schema()
+
+    def _default_schema(self) -> Dict[str, Any]:
+        """Return a default schema as fallback."""
+        return {
+            "entity_types": [
+                {"name": "service", "description": "A software service or microservice", "properties": ["name", "description"]},
+                {"name": "module", "description": "A code module or package", "properties": ["name", "path"]},
+                {"name": "function", "description": "A function or method", "properties": ["name", "signature"]},
+                {"name": "api", "description": "An API endpoint", "properties": ["name", "path", "method"]},
+                {"name": "feature", "description": "A product feature", "properties": ["name", "description"]},
+                {"name": "requirement", "description": "A requirement or user story", "properties": ["name", "description"]},
+            ],
+            "relation_types": [
+                {"name": "depends_on", "description": "Dependency relationship", "source_types": ["*"], "target_types": ["*"]},
+                {"name": "calls", "description": "Function/API call", "source_types": ["function", "service"], "target_types": ["function", "api"]},
+                {"name": "implements", "description": "Implementation relationship", "source_types": ["module", "function"], "target_types": ["feature", "requirement"]},
+                {"name": "contains", "description": "Containment relationship", "source_types": ["service", "module"], "target_types": ["module", "function"]},
+            ]
+        }
+
+
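Usage sketch for `EntitySchemaDiscoverer`, reusing the `llm` from the previous sketch; `sample_docs` is an assumed `List[Document]`. Per the code above, `discover()` samples at most 20 documents at 500 characters each and falls back to `_default_schema()` if the LLM call or JSON parsing fails:

```python
discoverer = EntitySchemaDiscoverer(llm)   # llm from the previous sketch
schema = discoverer.discover(sample_docs)  # sample_docs: List[Document] (assumed)

for et in schema["entity_types"]:
    print(et["name"], "-", et.get("description", ""))
for rt in schema["relation_types"]:
    print(rt["name"], "->", rt.get("target_types", []))
```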
+class EntityExtractor:
+    """Extracts entities from documents using LLM."""
+
+    def __init__(self, llm: Any, embedding: Optional[Any] = None, max_retries: int = 3, retry_delay: float = 2.0):
+        self.llm = llm
+        self.embedding = embedding
+        self.prompt = ChatPromptTemplate.from_template(ENTITY_EXTRACTION_PROMPT)
+        self.parser = JsonOutputParser()
+        self._entity_cache: Dict[str, Dict] = {}
+        self.max_retries = max_retries
+        self.retry_delay = retry_delay
+
+    def extract(
+        self,
+        document: Document,
+        schema: Optional[Dict[str, Any]] = None
+    ) -> List[Dict[str, Any]]:
+        """
+        Extract entities from a single document with retry logic.
+
+        Args:
+            document: Document to extract from
+            schema: Optional schema to guide extraction
+
+        Returns:
+            List of extracted entities with line numbers for citations
+        """
+        import time
+
+        file_path = document.metadata.get('file_path', document.metadata.get('source', 'unknown'))
+        source_toolkit = document.metadata.get('source_toolkit', 'filesystem')
+
+        last_error = None
+        for attempt in range(self.max_retries):
+            try:
+                content = document.page_content
+
+                # Add line numbers to content for better extraction
+                lines = content.split('\n')
+                numbered_content = '\n'.join(
+                    f"{i+1:4d} | {line}"
+                    for i, line in enumerate(lines[:200])  # Limit lines
+                )
+
+                # Build schema section
+                schema_section = ""
+                if schema and schema.get('entity_types'):
+                    types_str = ", ".join([et['name'] for et in schema['entity_types']])
+                    schema_section = f"Entity types to extract: {types_str}\n"
+                    for et in schema['entity_types']:
+                        schema_section += f"- {et['name']}: {et.get('description', '')}\n"
+
+                chain = self.prompt | self.llm | self.parser
+                result = chain.invoke({
+                    "content": numbered_content,
+                    "file_path": file_path,
+                    "source_toolkit": source_toolkit,
+                    "schema_section": schema_section
+                })
+
+                if not isinstance(result, list):
+                    result = [result] if result else []
+
+                # Track total lines in document for boundary checks
+                total_lines = len(lines)
+
+                # Add source tracking and normalize structure
+                for entity in result:
+                    entity['source_toolkit'] = source_toolkit
+                    entity['file_path'] = file_path
+
+                    # Ensure name is at top level
+                    if 'name' not in entity and 'properties' in entity:
+                        entity['name'] = entity['properties'].get('name', entity.get('id', 'unnamed'))
+
+                    # Expand small line ranges to provide meaningful context
+                    # Minimum span should be 3 lines
+                    line_start = entity.get('line_start', 1)
+                    line_end = entity.get('line_end', line_start)
+                    span = line_end - line_start
+
+                    if span < 2:  # Less than 3 lines of context
+                        # Expand range symmetrically around the center
+                        center = (line_start + line_end) // 2
+                        # Add 2 lines on each side (for 5 line minimum)
+                        new_start = max(1, center - 2)
+                        new_end = min(total_lines, center + 2)
+                        entity['line_start'] = new_start
+                        entity['line_end'] = new_end
+
+                return result
+
+            except Exception as e:
+                last_error = e
+                attempt_num = attempt + 1
+
+                if attempt_num < self.max_retries:
+                    delay = 10 * attempt_num
+                    logger.warning(
+                        f"Entity extraction failed for '{file_path}' (attempt {attempt_num}/{self.max_retries}): {e}. "
+                        f"Retrying in {delay}s..."
+                    )
+                    time.sleep(delay)
+                else:
+                    logger.error(
+                        f"Entity extraction failed for '{file_path}' after {self.max_retries} attempts: {e}"
+                    )
+
+        # All retries exhausted - raise exception to signal failure
+        raise RuntimeError(
+            f"Entity extraction failed for '{file_path}' after {self.max_retries} attempts: {last_error}"
+        )
+
+    def extract_batch(
+        self,
+        documents: List[Document],
+        schema: Optional[Dict[str, Any]] = None,
+        skip_on_error: bool = False
+    ) -> Union[List[Dict[str, Any]], Tuple[List[Dict[str, Any]], List[str]]]:
+        """
+        Extract entities from multiple documents with deduplication.
+
+        Args:
+            documents: List of documents to extract from
+            schema: Optional schema to guide extraction
+            skip_on_error: If True, skip documents that fail extraction after retries
+                and return tuple of (entities, failed_file_paths).
+                If False (default), raise exception on first failure.
+
+        Returns:
+            If skip_on_error=False: List of extracted entities
+            If skip_on_error=True: Tuple of (entities, failed_file_paths)
+        """
+        all_entities = []
+        failed_docs = []
+
+        for doc in documents:
+            try:
+                entities = self.extract(doc, schema)
+                all_entities.extend(entities)
+            except RuntimeError as e:
+                file_path = doc.metadata.get('file_path', doc.metadata.get('source', 'unknown'))
+                if skip_on_error:
+                    logger.warning(f"Skipping document '{file_path}' due to extraction failure: {e}")
+                    failed_docs.append(file_path)
+                else:
+                    raise
+
+        if failed_docs:
+            logger.warning(f"Skipped {len(failed_docs)} documents due to extraction failures: {failed_docs[:5]}{'...' if len(failed_docs) > 5 else ''}")
+
+        # Deduplicate
+        deduped = self._deduplicate_entities(all_entities)
+
+        # Return tuple when skip_on_error is enabled so caller can track failures
+        if skip_on_error:
+            return deduped, failed_docs
+
+        return deduped
+
+    def _deduplicate_entities(self, entities: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
+        """
+        Deduplicate entities using simple heuristics.
+
+        For more complex deduplication, LLM-based approach can be used.
+        """
+        seen = {}  # (type, normalized_name) -> entity
+        deduped = []
+
+        for entity in entities:
+            etype = entity.get('type', 'unknown')
+            name = entity.get('properties', {}).get('name', entity.get('id', ''))
+
+            # Normalize name
+            normalized = name.lower().strip().replace('_', ' ').replace('-', ' ')
+            key = (etype, normalized)
+
+            if key in seen:
+                # Merge properties
+                existing = seen[key]
+                for prop_key, prop_value in entity.get('properties', {}).items():
+                    if prop_key not in existing.get('properties', {}):
+                        existing.setdefault('properties', {})[prop_key] = prop_value
+            else:
+                seen[key] = entity
+                deduped.append(entity)
+
+        return deduped
+
+
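A sketch of batch extraction with failure tracking. With `skip_on_error=True`, `extract_batch` catches the `RuntimeError` a document raises after exhausting its retries and reports it in the second element of the returned tuple (inputs reuse the assumed names from the sketches above):

```python
extractor = EntityExtractor(llm, max_retries=3)

# Returns (entities, failed_file_paths) instead of raising on first failure.
entities, failed = extractor.extract_batch(sample_docs, schema=schema, skip_on_error=True)

print(f"extracted {len(entities)} entities after dedup; {len(failed)} documents failed")
for path in failed:
    print("failed:", path)
```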
+class RelationExtractor:
+    """Extracts relationships between entities using LLM."""
+
+    def __init__(self, llm: Any, max_retries: int = 3, retry_delay: float = 2.0):
+        self.llm = llm
+        self.prompt = ChatPromptTemplate.from_template(RELATION_EXTRACTION_PROMPT)
+        self.parser = JsonOutputParser()
+        self.max_retries = max_retries
+        self.retry_delay = retry_delay
+
+    def extract(
+        self,
+        document: Document,
+        entities: List[Dict[str, Any]],
+        schema: Optional[Dict[str, Any]] = None,
+        confidence_threshold: float = 0.5,
+        all_entities: Optional[List[Dict[str, Any]]] = None
+    ) -> List[Dict[str, Any]]:
+        """
+        Extract relationships from a document given known entities with retry logic.
+
+        Args:
+            document: Document to analyze
+            entities: Entities known to be in this document (for LLM context)
+            schema: Optional schema to guide extraction
+            confidence_threshold: Minimum confidence to include
+            all_entities: All entities in graph (for ID resolution across sources)
+
+        Returns:
+            List of extracted relations
+        """
+        import time
+
+        if not entities:
+            return []
+
+        # Use all_entities for ID resolution if provided, otherwise just doc entities
+        entities_for_lookup = all_entities if all_entities else entities
+
+        file_path = document.metadata.get('file_path', document.metadata.get('source', 'unknown'))
+        last_error = None
+
+        for attempt in range(self.max_retries):
+            try:
+                content = document.page_content[:4000]
+
+                # Filter entities from this document
+                doc_id = document.metadata.get('source')
+                doc_entities = [e for e in entities if e.get('source_doc_id') == doc_id]
+
+                if not doc_entities:
+                    doc_entities = entities[:20]  # Fall back to first N entities
+
+                # Format entities with ID first for clarity: "ID -> Name (type)"
+                entities_list = "\n".join([
+                    f"- {e.get('id')} -> {e.get('name', 'unnamed')} ({e.get('type', 'unknown')})"
+                    for e in doc_entities[:30]
+                ])
+
+                # Build schema section
+                schema_section = ""
+                if schema and schema.get('relation_types'):
+                    types_str = ", ".join([rt['name'] for rt in schema['relation_types']])
+                    schema_section = f"## Relationship types: {types_str}\n"
+                    for rt in schema['relation_types']:
+                        schema_section += f"- {rt['name']}: {rt.get('description', '')}\n"
+
+                chain = self.prompt | self.llm | self.parser
+                result = chain.invoke({
+                    "content": content,
+                    "entities_list": entities_list,
+                    "schema_section": schema_section
+                })
+
+                if not isinstance(result, list):
+                    result = [result] if result else []
+
+                # Build lookup tables from ALL entities (enables cross-source resolution)
+                # LLMs often use names instead of hex IDs, so we map both
+                id_lookup = {}
+                name_to_id = {}  # For fuzzy matching fallback
+
+                for e in entities_for_lookup:
+                    entity_id = e.get('id', '')
+                    entity_name = e.get('name', '')
+                    entity_type = e.get('type', '')
+
+                    if not entity_id:
+                        continue
+
+                    # Direct ID match
+                    id_lookup[entity_id] = entity_id
+                    id_lookup[entity_id.lower()] = entity_id
+
+                    # Name-based lookups (what LLM often returns)
+                    if entity_name:
+                        # Exact name
+                        id_lookup[entity_name] = entity_id
+                        id_lookup[entity_name.lower()] = entity_id
+                        # snake_case version
+                        snake_name = entity_name.lower().replace(' ', '_').replace('-', '_').replace(':', '_')
+                        id_lookup[snake_name] = entity_id
+                        # Remove articles/filler words for matching
+                        short_snake = snake_name.replace('_a_', '_').replace('_an_', '_').replace('_the_', '_').replace('_your_', '_').replace('_my_', '_')
+                        id_lookup[short_snake] = entity_id
+                        # type:name format
+                        type_name = f"{entity_type}:{snake_name}"
+                        id_lookup[type_name] = entity_id
+                        id_lookup[type_name.lower()] = entity_id
+                        # Store for fuzzy matching with word sets
+                        words = set(snake_name.split('_'))
+                        name_to_id[snake_name] = (entity_id, words)
+
+                def resolve_id(ref: str) -> Optional[str]:
+                    """Resolve an entity reference to its actual ID."""
+                    if not ref:
+                        return None
+                    # Direct lookup
+                    if ref in id_lookup:
+                        return id_lookup[ref]
+                    ref_lower = ref.lower()
+                    if ref_lower in id_lookup:
+                        return id_lookup[ref_lower]
+                    # Snake case the reference
+                    ref_snake = ref_lower.replace(' ', '_').replace('-', '_').replace(':', '_')
+                    if ref_snake in id_lookup:
+                        return id_lookup[ref_snake]
+
+                    # Fuzzy matching: substring or word overlap
+                    ref_words = set(ref_snake.split('_'))
+                    best_match = None
+                    best_score = 0
+
+                    for name, (eid, name_words) in name_to_id.items():
+                        # Substring match
+                        if ref_snake in name or name in ref_snake:
+                            return eid
+
+                        # Word overlap score
+                        overlap = len(ref_words & name_words)
+                        if overlap >= 2 and overlap > best_score:
+                            # At least 2 words must match
+                            best_score = overlap
+                            best_match = eid
+
+                    return best_match
+
+                # Resolve relations to actual entity IDs
+                resolved = []
+                logger.info(f"Relation extraction got {len(result)} raw relations from LLM")
+                logger.info(f"ID lookup has {len(id_lookup)} entries, name_to_id has {len(name_to_id)} entries")
+
+                for r in result:
+                    source = r.get('source_id', '')
+                    target = r.get('target_id', '')
+
+                    # Try to resolve source and target
+                    resolved_source = resolve_id(source)
+                    resolved_target = resolve_id(target)
+
+                    logger.debug(f"Resolving: {source} -> {resolved_source}, {target} -> {resolved_target}")
+
+                    if resolved_source and resolved_target:
+                        r['source_id'] = resolved_source
+                        r['target_id'] = resolved_target
+                        resolved.append(r)
+                    else:
+                        logger.warning(f"Could not resolve relation: {source} ({resolved_source}) -> {target} ({resolved_target})")
+
+                logger.info(f"Resolved {len(resolved)} relations successfully")
+
+                # Filter by confidence
+                filtered = [
+                    r for r in resolved
+                    if r.get('confidence', 0) >= confidence_threshold
+                ]
+
+                return filtered
+
+            except Exception as e:
+                last_error = e
+                attempt_num = attempt + 1
+
+                if attempt_num < self.max_retries:
+                    # Exponential backoff: 10^0=1s, 10^1=10s, 10^2=100s
+                    delay = 10 ** attempt
+                    logger.warning(
+                        f"Relation extraction failed for '{file_path}' (attempt {attempt_num}/{self.max_retries}): {e}. "
|
|
909
|
+
f"Retrying in {delay}s..."
|
|
910
|
+
)
|
|
911
|
+
time.sleep(delay)
|
|
912
|
+
else:
|
|
913
|
+
logger.warning(
|
|
914
|
+
f"Relation extraction failed for '{file_path}' after {self.max_retries} attempts: {e}. Skipping."
|
|
915
|
+
)
|
|
916
|
+
|
|
917
|
+
# Return empty list on failure (relations are optional)
|
|
918
|
+
return []
|
|
919
|
+
|
|
920
|
+
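The resolution machinery above exists because models rarely echo the opaque IDs they are given; they answer with display names, snake_case slugs, or `type:name` strings. A minimal standalone sketch of the same normalization idea (the entity data below is invented for illustration):

```python
# Standalone sketch of the lookup strategy; entities are made up.
def normalize(name: str) -> str:
    """Lowercase and snake_case a reference the same way the extractor does."""
    return name.lower().replace(' ', '_').replace('-', '_').replace(':', '_')

entities = [
    {'id': 'a1b2c3', 'name': 'Payment Service', 'type': 'service'},
    {'id': 'd4e5f6', 'name': 'Order Database', 'type': 'datastore'},
]

id_lookup = {}
for e in entities:
    snake = normalize(e['name'])
    for key in (e['id'], e['name'], e['name'].lower(), snake, f"{e['type']}:{snake}"):
        id_lookup[key] = e['id']

# Several surface forms an LLM might emit all resolve to one canonical ID:
for ref in ('a1b2c3', 'Payment Service', 'payment_service', 'service:payment_service'):
    resolved = id_lookup.get(ref) or id_lookup.get(normalize(ref))
    assert resolved == 'a1b2c3'
```

The in-method version layers article stripping and a two-word-overlap fuzzy fallback on top of this.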
    def extract_batch(
        self,
        documents: List[Document],
        entities: List[Dict[str, Any]],
        schema: Optional[Dict[str, Any]] = None,
        confidence_threshold: float = 0.5
    ) -> List[Dict[str, Any]]:
        """Extract relations from multiple documents."""
        all_relations = []

        for doc in documents:
            relations = self.extract(
                doc,
                entities,
                schema=schema,
                confidence_threshold=confidence_threshold
            )
            all_relations.extend(relations)

        # Deduplicate relations
        return self._deduplicate_relations(all_relations)

    def _deduplicate_relations(self, relations: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
        """Deduplicate relations by source-type-target key."""
        seen = {}
        deduped = []

        for rel in relations:
            key = (
                rel.get('source_id'),
                rel.get('relation_type'),
                rel.get('target_id')
            )

            if key not in seen:
                seen[key] = rel
                deduped.append(rel)
            else:
                # Keep the higher-confidence duplicate
                if rel.get('confidence', 0) > seen[key].get('confidence', 0):
                    seen[key] = rel
                    # Update in deduped list
                    for i, r in enumerate(deduped):
                        if (r.get('source_id'), r.get('relation_type'), r.get('target_id')) == key:
                            deduped[i] = rel
                            break

        return deduped
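A condensed sketch of what `_deduplicate_relations` guarantees (the relation dicts are invented; the real method additionally preserves first-seen list order by replacing entries in place):

```python
relations = [
    {'source_id': 'a1', 'relation_type': 'calls', 'target_id': 'b2', 'confidence': 0.6},
    {'source_id': 'a1', 'relation_type': 'calls', 'target_id': 'b2', 'confidence': 0.9},
    {'source_id': 'a1', 'relation_type': 'reads', 'target_id': 'b2', 'confidence': 0.8},
]

best = {}
for rel in relations:
    key = (rel['source_id'], rel['relation_type'], rel['target_id'])
    if key not in best or rel['confidence'] > best[key]['confidence']:
        best[key] = rel

deduped = list(best.values())
assert len(deduped) == 2                                   # (a1, calls, b2) collapsed
assert best[('a1', 'calls', 'b2')]['confidence'] == 0.9    # higher confidence wins
```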
# ============================================================================
# FACT EXTRACTION (for non-code files)
# ============================================================================

FACT_EXTRACTION_PROMPT = """Extract factual information from the following document.

## Document content:
---
{content}
---

## Document metadata:
- File: {file_path}
- Source: {source_toolkit}

## Canonical Fact Types:
Extract facts using these canonical types:

1. **decision** - Architectural or business decisions
   - Properties: title, rationale, alternatives, outcome, date, stakeholders
   - Example: "We chose PostgreSQL over MongoDB for transactional consistency"

2. **requirement** - Functional or non-functional requirements
   - Properties: title, description, priority, status, acceptance_criteria
   - Example: "System must handle 1000 concurrent users"

3. **definition** - Definitions of terms, concepts, or standards
   - Properties: term, definition, context, synonyms
   - Example: "A 'tenant' refers to an organization using our SaaS platform"

4. **date** - Important dates, deadlines, milestones
   - Properties: event, date, description, status
   - Example: "MVP launch scheduled for Q2 2024"

5. **reference** - External references, links, citations
   - Properties: title, url, description, type (spec, documentation, standard)
   - Example: "OAuth 2.0 specification: RFC 6749"

6. **contact** - People, teams, ownership information
   - Properties: name, role, team, email, responsibilities
   - Example: "John Smith is the product owner for the payments module"

## Instructions:
1. Identify facts from the document that match the canonical types
2. Each fact must have a unique ID (short hash-like string)
3. Include a confidence score (0.0-1.0) based on how explicit the fact is
4. Include line_start and line_end for citation support

## Output Format:
Respond with ONLY a JSON array:
[
  {{
    "id": "<unique_id>",
    "fact_type": "<decision|requirement|definition|date|reference|contact>",
    "title": "<brief title>",
    "properties": {{
      "<property_name>": "<value>"
    }},
    "line_start": <line_number>,
    "line_end": <line_number>,
    "confidence": <0.0-1.0>
  }}
]

If no facts are found, return an empty array: []
"""
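For orientation, this is the shape of output the prompt asks for, filled in for a hypothetical README that says "MVP launch scheduled for Q2 2024" on line 12 (the values are illustrative, not from a real run):

```python
expected = [
    {
        "id": "f3a9c21d04b7",  # short hash-like string, as the prompt instructs
        "fact_type": "date",
        "title": "MVP launch",
        "properties": {"event": "MVP launch", "date": "Q2 2024", "status": "scheduled"},
        "line_start": 12,
        "line_end": 12,
        "confidence": 0.9,
    }
]
```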
# Canonical fact types for validation
CANONICAL_FACT_TYPES = {
    "decision": {
        "description": "Architectural or business decisions",
        "properties": ["title", "rationale", "alternatives", "outcome", "date", "stakeholders"]
    },
    "requirement": {
        "description": "Functional or non-functional requirements",
        "properties": ["title", "description", "priority", "status", "acceptance_criteria"]
    },
    "definition": {
        "description": "Definitions of terms, concepts, or standards",
        "properties": ["term", "definition", "context", "synonyms"]
    },
    "date": {
        "description": "Important dates, deadlines, milestones",
        "properties": ["event", "date", "description", "status"]
    },
    "reference": {
        "description": "External references, links, citations",
        "properties": ["title", "url", "description", "type"]
    },
    "contact": {
        "description": "People, teams, ownership information",
        "properties": ["name", "role", "team", "email", "responsibilities"]
    }
}
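`extract()` below gates model output against this registry with a lowercase membership check; in isolation that gate reduces to the following (the fact dict is made up):

```python
fact = {"fact_type": "Decision", "title": "Use PostgreSQL"}
fact_type = fact.get("fact_type", "").lower()

if fact_type in CANONICAL_FACT_TYPES:
    expected_props = CANONICAL_FACT_TYPES[fact_type]["properties"]
    print(f"ok: {fact_type}; expected properties: {expected_props}")
else:
    print(f"skipping non-canonical fact type: {fact_type}")
```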
# Code-specific fact types for semantic code understanding
CODE_FACT_TYPES = {
    "algorithm": {
        "description": "Algorithm or pattern used in the code",
        "properties": ["name", "description", "complexity", "use_case"]
    },
    "behavior": {
        "description": "What the code does - actions, side effects, I/O",
        "properties": ["action", "description", "inputs", "outputs", "side_effects"]
    },
    "validation": {
        "description": "Input validation, checks, guards, assertions",
        "properties": ["what", "how", "error_handling"]
    },
    "dependency": {
        "description": "External service calls, API usage, library dependencies",
        "properties": ["service", "operation", "purpose", "error_handling"]
    },
    "error_handling": {
        "description": "How errors are handled - retry, fallback, logging",
        "properties": ["strategy", "description", "fallback", "logging"]
    }
}

CODE_FACT_EXTRACTION_PROMPT = """Analyze the following code and extract semantic facts about what it does.

Focus on understanding the CODE'S PURPOSE AND BEHAVIOR, not its structure.
Parsers already extract structure (classes, functions, imports). You extract MEANING.

## Code content (with line numbers):
---
{content}
---

Source file: {file_path}

## Fact types to extract:
1. **algorithm** - What algorithm or design pattern is used?
   - Example: "Uses binary search for O(log n) lookup"
   - Example: "Implements observer pattern for event handling"

2. **behavior** - What does this code DO? (actions, I/O, side effects)
   - Example: "Sends email notification when payment fails"
   - Example: "Writes audit log for every database mutation"
   - Example: "Caches API responses for 5 minutes"

3. **validation** - What validation or checks are performed?
   - Example: "Validates email format using RFC 5322 regex"
   - Example: "Checks user permissions before allowing access"

4. **dependency** - What external services or APIs are called?
   - Example: "Calls Stripe API for payment processing"
   - Example: "Queries PostgreSQL for user data"

5. **error_handling** - How are errors handled?
   - Example: "Retries failed requests 3 times with exponential backoff"
   - Example: "Falls back to cached data when API is unavailable"

## Instructions:
- Extract the 1-5 most important facts about what this code DOES
- Focus on business logic and behavior, not syntax
- Include line numbers where the behavior is implemented
- Be specific and actionable

## Output Format:
Respond with ONLY a JSON array:
[
  {{
    "fact_type": "<algorithm|behavior|validation|dependency|error_handling>",
    "subject": "<what is being described>",
    "predicate": "<action or relationship>",
    "object": "<target or outcome>",
    "line_start": <start_line>,
    "line_end": <end_line>,
    "confidence": <0.0-1.0>
  }}
]

If no meaningful facts can be extracted, return an empty array: []
"""
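Unlike the document prompt, the code prompt yields subject/predicate/object triples rather than titled facts. For a hypothetical HTTP helper with a retry loop, a plausible result would look like this (illustrative only, not a real run):

```python
expected = [
    {
        "fact_type": "error_handling",
        "subject": "fetch_with_retry",
        "predicate": "retries failed requests",
        "object": "3 times with exponential backoff",
        "line_start": 42,
        "line_end": 58,
        "confidence": 0.85,
    }
]
```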
class FactExtractor:
    """
    Extracts structured facts from documents using a lightweight LLM.

    Two extraction modes:
    - extract(): For text/docs - extracts decisions, requirements, definitions, etc.
    - extract_code(): For code - extracts algorithms, behaviors, validations, etc.
    """

    def __init__(self, llm: Any, max_retries: int = 3, retry_delay: float = 2.0):
        self.llm = llm
        self.prompt = ChatPromptTemplate.from_template(FACT_EXTRACTION_PROMPT)
        self.code_prompt = ChatPromptTemplate.from_template(CODE_FACT_EXTRACTION_PROMPT)
        self.parser = JsonOutputParser()
        self.max_retries = max_retries
        self.retry_delay = retry_delay
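A minimal usage sketch, assuming any LangChain-compatible chat model; `ChatOpenAI`, the model name, and the file path are stand-ins for illustration, not requirements of the class:

```python
from langchain_core.documents import Document
from langchain_openai import ChatOpenAI

llm = ChatOpenAI(model="gpt-4o-mini", temperature=0)
extractor = FactExtractor(llm, max_retries=2)

doc = Document(
    page_content="We chose PostgreSQL over MongoDB for transactional consistency.",
    metadata={"file_path": "docs/adr/0001-database.md"},
)
facts = extractor.extract(doc)                               # all canonical types
decisions = extractor.extract(doc, fact_types=["decision"])  # filtered
```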
    def extract(
        self,
        document: Document,
        fact_types: Optional[List[str]] = None
    ) -> List[Dict[str, Any]]:
        """
        Extract facts from a single document.

        Args:
            document: Document to extract from
            fact_types: Optional filter for specific fact types

        Returns:
            List of extracted facts
        """
        import time

        file_path = document.metadata.get('file_path', document.metadata.get('source', 'unknown'))
        source_toolkit = document.metadata.get('source_toolkit', 'filesystem')

        last_error = None
        for attempt in range(self.max_retries):
            try:
                content = document.page_content

                # Add line numbers for citation support
                lines = content.split('\n')
                numbered_content = '\n'.join(
                    f"{i+1:4d} | {line}"
                    for i, line in enumerate(lines[:300])  # Limit for LLM context
                )

                chain = self.prompt | self.llm | self.parser
                result = chain.invoke({
                    "content": numbered_content,
                    "file_path": file_path,
                    "source_toolkit": source_toolkit
                })

                if not isinstance(result, list):
                    result = [result] if result else []

                # Validate and enrich facts
                validated_facts = []
                for fact in result:
                    fact_type = fact.get('fact_type', '').lower()

                    # Skip if not a canonical type
                    if fact_type not in CANONICAL_FACT_TYPES:
                        logger.warning(f"Skipping non-canonical fact type: {fact_type}")
                        continue

                    # Filter by requested types if specified
                    if fact_types and fact_type not in fact_types:
                        continue

                    # Ensure required fields
                    if 'id' not in fact:
                        fact['id'] = hashlib.md5(
                            f"{file_path}:{fact.get('title', '')}:{fact_type}".encode()
                        ).hexdigest()[:12]

                    # Add source tracking
                    fact['source_toolkit'] = source_toolkit
                    fact['file_path'] = file_path
                    fact['source'] = 'fact_extractor'

                    # Normalize confidence
                    if 'confidence' not in fact:
                        fact['confidence'] = 0.7

                    validated_facts.append(fact)

                return validated_facts

            except Exception as e:
                last_error = e
                attempt_num = attempt + 1

                if attempt_num < self.max_retries:
                    delay = 10 * attempt_num
                    logger.warning(
                        f"Fact extraction failed for '{file_path}' (attempt {attempt_num}/{self.max_retries}): {e}. "
                        f"Retrying in {delay}s..."
                    )
                    time.sleep(delay)
                else:
                    logger.warning(
                        f"Fact extraction failed for '{file_path}' after {self.max_retries} attempts: {e}. Skipping."
                    )

        return []

    def extract_batch(
        self,
        documents: List[Document],
        fact_types: Optional[List[str]] = None
    ) -> List[Dict[str, Any]]:
        """
        Extract facts from multiple documents with deduplication.

        Args:
            documents: List of documents to extract from
            fact_types: Optional filter for specific fact types

        Returns:
            List of extracted facts (deduplicated)
        """
        all_facts = []

        for doc in documents:
            facts = self.extract(doc, fact_types)
            all_facts.extend(facts)

        return self._deduplicate_facts(all_facts)

    def extract_code(
        self,
        document: Document,
        fact_types: Optional[List[str]] = None
    ) -> List[Dict[str, Any]]:
        """
        Extract semantic facts from code - what the code DOES.

        Args:
            document: Code document to extract from
            fact_types: Optional filter for specific fact types

        Returns:
            List of extracted code facts
        """
        import time

        file_path = document.metadata.get('file_path', document.metadata.get('source', 'unknown'))
        source_toolkit = document.metadata.get('source_toolkit', 'filesystem')

        last_error = None
        for attempt in range(self.max_retries):
            try:
                content = document.page_content

                # Add line numbers for citation support
                lines = content.split('\n')
                numbered_content = '\n'.join(
                    f"{i+1:4d} | {line}"
                    for i, line in enumerate(lines[:200])  # Limit for LLM context
                )

                chain = self.code_prompt | self.llm | self.parser
                result = chain.invoke({
                    "content": numbered_content,
                    "file_path": file_path
                })

                if not isinstance(result, list):
                    result = [result] if result else []

                # Validate and enrich facts
                validated_facts = []
                for fact in result:
                    fact_type = fact.get('fact_type', '').lower()

                    # Skip if not a canonical code fact type
                    if fact_type not in CODE_FACT_TYPES:
                        logger.warning(f"Skipping non-canonical code fact type: {fact_type}")
                        continue

                    # Filter by requested types if specified
                    if fact_types and fact_type not in fact_types:
                        continue

                    # Ensure required fields
                    if 'id' not in fact:
                        fact['id'] = hashlib.md5(
                            f"{file_path}:{fact.get('subject', '')}:{fact_type}".encode()
                        ).hexdigest()[:12]

                    # Add source tracking
                    fact['source_toolkit'] = source_toolkit
                    fact['file_path'] = file_path
                    fact['source'] = 'code_fact_extractor'

                    # Normalize confidence
                    if 'confidence' not in fact:
                        fact['confidence'] = 0.7

                    validated_facts.append(fact)

                return validated_facts

            except Exception as e:
                last_error = e
                attempt_num = attempt + 1

                if attempt_num < self.max_retries:
                    delay = 10 * attempt_num
                    logger.warning(
                        f"Code fact extraction failed for '{file_path}' (attempt {attempt_num}/{self.max_retries}): {e}. "
                        f"Retrying in {delay}s..."
                    )
                    time.sleep(delay)
                else:
                    logger.warning(
                        f"Code fact extraction failed for '{file_path}' after {self.max_retries} attempts: {e}. Skipping."
                    )

        return []

    def extract_batch_code(
        self,
        documents: List[Document],
        fact_types: Optional[List[str]] = None
    ) -> List[Dict[str, Any]]:
        """
        Extract facts from multiple code documents with deduplication.

        This method processes code documents using the code-specific prompt
        and fact types (algorithm, behavior, validation, etc.).

        Args:
            documents: List of code documents to extract from
            fact_types: Optional filter for specific fact types

        Returns:
            List of extracted facts (deduplicated)
        """
        all_facts = []

        for doc in documents:
            facts = self.extract_code(doc, fact_types)
            all_facts.extend(facts)

        return self._deduplicate_facts(all_facts)

    def _is_code_file(self, file_path: str) -> bool:
        """Check if file is a code file that should use parsers instead."""
        code_extensions = {
            '.py', '.js', '.ts', '.jsx', '.tsx', '.java', '.kt', '.cs',
            '.go', '.rs', '.swift', '.c', '.cpp', '.h', '.hpp', '.rb',
            '.php', '.scala', '.clj', '.ex', '.exs', '.erl', '.hs'
        }

        if not file_path:
            return False

        import os
        _, ext = os.path.splitext(file_path.lower())
        return ext in code_extensions
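`_is_code_file` lets a caller route documents to the matching extraction mode; continuing the earlier sketch (the documents here are invented):

```python
docs = [
    Document(page_content="def pay(): ...", metadata={"file_path": "src/pay.py"}),
    Document(page_content="# ADR 1: use PostgreSQL", metadata={"file_path": "docs/adr-1.md"}),
]

code_docs = [d for d in docs if extractor._is_code_file(d.metadata["file_path"])]
text_docs = [d for d in docs if not extractor._is_code_file(d.metadata["file_path"])]

facts = extractor.extract_batch_code(code_docs) + extractor.extract_batch(text_docs)
```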
    def _deduplicate_facts(self, facts: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
        """
        Deduplicate facts using title and type similarity.

        File-level deduplication: same fact type + similar title = duplicate.
        """
        seen = {}  # (fact_type, normalized_title) -> fact
        deduped = []

        for fact in facts:
            fact_type = fact.get('fact_type', 'unknown')
            title = fact.get('title', fact.get('id', ''))

            # Normalize title for comparison
            normalized = title.lower().strip()
            # Remove common stop words for better matching
            for word in ['the', 'a', 'an', 'is', 'are', 'was', 'were']:
                normalized = normalized.replace(f' {word} ', ' ')
            normalized = ' '.join(normalized.split())  # Collapse whitespace

            key = (fact_type, normalized)

            if key in seen:
                # Merge properties, keep higher confidence
                existing = seen[key]
                if fact.get('confidence', 0) > existing.get('confidence', 0):
                    # Replace with the higher-confidence version, carrying over
                    # any properties the new fact is missing
                    for prop_key, prop_value in existing.get('properties', {}).items():
                        if prop_key not in fact.get('properties', {}):
                            fact.setdefault('properties', {})[prop_key] = prop_value
                    seen[key] = fact
                    # Update in the deduped list; match by identity, since the key
                    # uses the normalized title rather than the raw one
                    for i, f in enumerate(deduped):
                        if f is existing:
                            deduped[i] = fact
                            break
                else:
                    # Merge new properties into existing
                    for prop_key, prop_value in fact.get('properties', {}).items():
                        if prop_key not in existing.get('properties', {}):
                            existing.setdefault('properties', {})[prop_key] = prop_value
            else:
                seen[key] = fact
                deduped.append(fact)

        return deduped

    def get_fact_type_info(self, fact_type: str, code: bool = False) -> Optional[Dict[str, Any]]:
        """Get information about a canonical fact type."""
        types = CODE_FACT_TYPES if code else CANONICAL_FACT_TYPES
        return types.get(fact_type)

    @staticmethod
    def get_canonical_types(code: bool = False) -> List[str]:
        """Get a list of all canonical fact types."""
        types = CODE_FACT_TYPES if code else CANONICAL_FACT_TYPES
        return list(types.keys())
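Since `get_canonical_types` is a staticmethod, the registries can be inspected without constructing an extractor or an LLM; ordering follows the dict declarations above:

```python
assert FactExtractor.get_canonical_types() == [
    "decision", "requirement", "definition", "date", "reference", "contact"
]
assert FactExtractor.get_canonical_types(code=True) == [
    "algorithm", "behavior", "validation", "dependency", "error_handling"
]
```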