alita-sdk 0.3.351__py3-none-any.whl → 0.3.499__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- alita_sdk/cli/__init__.py +10 -0
- alita_sdk/cli/__main__.py +17 -0
- alita_sdk/cli/agent/__init__.py +5 -0
- alita_sdk/cli/agent/default.py +258 -0
- alita_sdk/cli/agent_executor.py +155 -0
- alita_sdk/cli/agent_loader.py +215 -0
- alita_sdk/cli/agent_ui.py +228 -0
- alita_sdk/cli/agents.py +3601 -0
- alita_sdk/cli/callbacks.py +647 -0
- alita_sdk/cli/cli.py +168 -0
- alita_sdk/cli/config.py +306 -0
- alita_sdk/cli/context/__init__.py +30 -0
- alita_sdk/cli/context/cleanup.py +198 -0
- alita_sdk/cli/context/manager.py +731 -0
- alita_sdk/cli/context/message.py +285 -0
- alita_sdk/cli/context/strategies.py +289 -0
- alita_sdk/cli/context/token_estimation.py +127 -0
- alita_sdk/cli/formatting.py +182 -0
- alita_sdk/cli/input_handler.py +419 -0
- alita_sdk/cli/inventory.py +1256 -0
- alita_sdk/cli/mcp_loader.py +315 -0
- alita_sdk/cli/toolkit.py +327 -0
- alita_sdk/cli/toolkit_loader.py +85 -0
- alita_sdk/cli/tools/__init__.py +43 -0
- alita_sdk/cli/tools/approval.py +224 -0
- alita_sdk/cli/tools/filesystem.py +1751 -0
- alita_sdk/cli/tools/planning.py +389 -0
- alita_sdk/cli/tools/terminal.py +414 -0
- alita_sdk/community/__init__.py +64 -8
- alita_sdk/community/inventory/__init__.py +224 -0
- alita_sdk/community/inventory/config.py +257 -0
- alita_sdk/community/inventory/enrichment.py +2137 -0
- alita_sdk/community/inventory/extractors.py +1469 -0
- alita_sdk/community/inventory/ingestion.py +3172 -0
- alita_sdk/community/inventory/knowledge_graph.py +1457 -0
- alita_sdk/community/inventory/parsers/__init__.py +218 -0
- alita_sdk/community/inventory/parsers/base.py +295 -0
- alita_sdk/community/inventory/parsers/csharp_parser.py +907 -0
- alita_sdk/community/inventory/parsers/go_parser.py +851 -0
- alita_sdk/community/inventory/parsers/html_parser.py +389 -0
- alita_sdk/community/inventory/parsers/java_parser.py +593 -0
- alita_sdk/community/inventory/parsers/javascript_parser.py +629 -0
- alita_sdk/community/inventory/parsers/kotlin_parser.py +768 -0
- alita_sdk/community/inventory/parsers/markdown_parser.py +362 -0
- alita_sdk/community/inventory/parsers/python_parser.py +604 -0
- alita_sdk/community/inventory/parsers/rust_parser.py +858 -0
- alita_sdk/community/inventory/parsers/swift_parser.py +832 -0
- alita_sdk/community/inventory/parsers/text_parser.py +322 -0
- alita_sdk/community/inventory/parsers/yaml_parser.py +370 -0
- alita_sdk/community/inventory/patterns/__init__.py +61 -0
- alita_sdk/community/inventory/patterns/ast_adapter.py +380 -0
- alita_sdk/community/inventory/patterns/loader.py +348 -0
- alita_sdk/community/inventory/patterns/registry.py +198 -0
- alita_sdk/community/inventory/presets.py +535 -0
- alita_sdk/community/inventory/retrieval.py +1403 -0
- alita_sdk/community/inventory/toolkit.py +173 -0
- alita_sdk/community/inventory/visualize.py +1370 -0
- alita_sdk/configurations/bitbucket.py +94 -2
- alita_sdk/configurations/confluence.py +96 -1
- alita_sdk/configurations/gitlab.py +79 -0
- alita_sdk/configurations/jira.py +103 -0
- alita_sdk/configurations/testrail.py +88 -0
- alita_sdk/configurations/xray.py +93 -0
- alita_sdk/configurations/zephyr_enterprise.py +93 -0
- alita_sdk/configurations/zephyr_essential.py +75 -0
- alita_sdk/runtime/clients/artifact.py +1 -1
- alita_sdk/runtime/clients/client.py +214 -42
- alita_sdk/runtime/clients/mcp_discovery.py +342 -0
- alita_sdk/runtime/clients/mcp_manager.py +262 -0
- alita_sdk/runtime/clients/sandbox_client.py +373 -0
- alita_sdk/runtime/langchain/assistant.py +118 -30
- alita_sdk/runtime/langchain/constants.py +8 -1
- alita_sdk/runtime/langchain/document_loaders/AlitaDocxMammothLoader.py +315 -3
- alita_sdk/runtime/langchain/document_loaders/AlitaExcelLoader.py +103 -60
- alita_sdk/runtime/langchain/document_loaders/AlitaJSONLoader.py +4 -1
- alita_sdk/runtime/langchain/document_loaders/AlitaPowerPointLoader.py +41 -12
- alita_sdk/runtime/langchain/document_loaders/AlitaTableLoader.py +1 -1
- alita_sdk/runtime/langchain/document_loaders/constants.py +116 -99
- alita_sdk/runtime/langchain/interfaces/llm_processor.py +2 -2
- alita_sdk/runtime/langchain/langraph_agent.py +307 -71
- alita_sdk/runtime/langchain/utils.py +48 -8
- alita_sdk/runtime/llms/preloaded.py +2 -6
- alita_sdk/runtime/models/mcp_models.py +61 -0
- alita_sdk/runtime/toolkits/__init__.py +26 -0
- alita_sdk/runtime/toolkits/application.py +9 -2
- alita_sdk/runtime/toolkits/artifact.py +18 -6
- alita_sdk/runtime/toolkits/datasource.py +13 -6
- alita_sdk/runtime/toolkits/mcp.py +780 -0
- alita_sdk/runtime/toolkits/planning.py +178 -0
- alita_sdk/runtime/toolkits/tools.py +205 -55
- alita_sdk/runtime/toolkits/vectorstore.py +9 -4
- alita_sdk/runtime/tools/__init__.py +11 -3
- alita_sdk/runtime/tools/application.py +7 -0
- alita_sdk/runtime/tools/artifact.py +225 -12
- alita_sdk/runtime/tools/function.py +95 -5
- alita_sdk/runtime/tools/graph.py +10 -4
- alita_sdk/runtime/tools/image_generation.py +212 -0
- alita_sdk/runtime/tools/llm.py +494 -102
- alita_sdk/runtime/tools/mcp_inspect_tool.py +284 -0
- alita_sdk/runtime/tools/mcp_remote_tool.py +181 -0
- alita_sdk/runtime/tools/mcp_server_tool.py +4 -4
- alita_sdk/runtime/tools/planning/__init__.py +36 -0
- alita_sdk/runtime/tools/planning/models.py +246 -0
- alita_sdk/runtime/tools/planning/wrapper.py +607 -0
- alita_sdk/runtime/tools/router.py +2 -1
- alita_sdk/runtime/tools/sandbox.py +180 -79
- alita_sdk/runtime/tools/vectorstore.py +22 -21
- alita_sdk/runtime/tools/vectorstore_base.py +125 -52
- alita_sdk/runtime/utils/AlitaCallback.py +106 -20
- alita_sdk/runtime/utils/mcp_client.py +465 -0
- alita_sdk/runtime/utils/mcp_oauth.py +244 -0
- alita_sdk/runtime/utils/mcp_sse_client.py +405 -0
- alita_sdk/runtime/utils/mcp_tools_discovery.py +124 -0
- alita_sdk/runtime/utils/streamlit.py +40 -13
- alita_sdk/runtime/utils/toolkit_utils.py +28 -9
- alita_sdk/runtime/utils/utils.py +12 -0
- alita_sdk/tools/__init__.py +77 -33
- alita_sdk/tools/ado/repos/__init__.py +7 -6
- alita_sdk/tools/ado/repos/repos_wrapper.py +11 -11
- alita_sdk/tools/ado/test_plan/__init__.py +7 -7
- alita_sdk/tools/ado/wiki/__init__.py +7 -11
- alita_sdk/tools/ado/wiki/ado_wrapper.py +89 -15
- alita_sdk/tools/ado/work_item/__init__.py +7 -11
- alita_sdk/tools/ado/work_item/ado_wrapper.py +17 -8
- alita_sdk/tools/advanced_jira_mining/__init__.py +8 -7
- alita_sdk/tools/aws/delta_lake/__init__.py +11 -9
- alita_sdk/tools/azure_ai/search/__init__.py +7 -6
- alita_sdk/tools/base_indexer_toolkit.py +345 -70
- alita_sdk/tools/bitbucket/__init__.py +9 -8
- alita_sdk/tools/bitbucket/api_wrapper.py +50 -6
- alita_sdk/tools/browser/__init__.py +4 -4
- alita_sdk/tools/carrier/__init__.py +4 -6
- alita_sdk/tools/chunkers/__init__.py +3 -1
- alita_sdk/tools/chunkers/sematic/json_chunker.py +1 -0
- alita_sdk/tools/chunkers/sematic/markdown_chunker.py +97 -6
- alita_sdk/tools/chunkers/sematic/proposal_chunker.py +1 -1
- alita_sdk/tools/chunkers/universal_chunker.py +270 -0
- alita_sdk/tools/cloud/aws/__init__.py +7 -6
- alita_sdk/tools/cloud/azure/__init__.py +7 -6
- alita_sdk/tools/cloud/gcp/__init__.py +7 -6
- alita_sdk/tools/cloud/k8s/__init__.py +7 -6
- alita_sdk/tools/code/linter/__init__.py +7 -7
- alita_sdk/tools/code/loaders/codesearcher.py +3 -2
- alita_sdk/tools/code/sonar/__init__.py +8 -7
- alita_sdk/tools/code_indexer_toolkit.py +199 -0
- alita_sdk/tools/confluence/__init__.py +9 -8
- alita_sdk/tools/confluence/api_wrapper.py +171 -75
- alita_sdk/tools/confluence/loader.py +10 -0
- alita_sdk/tools/custom_open_api/__init__.py +9 -4
- alita_sdk/tools/elastic/__init__.py +8 -7
- alita_sdk/tools/elitea_base.py +492 -52
- alita_sdk/tools/figma/__init__.py +7 -7
- alita_sdk/tools/figma/api_wrapper.py +2 -1
- alita_sdk/tools/github/__init__.py +9 -9
- alita_sdk/tools/github/api_wrapper.py +9 -26
- alita_sdk/tools/github/github_client.py +62 -2
- alita_sdk/tools/gitlab/__init__.py +8 -8
- alita_sdk/tools/gitlab/api_wrapper.py +135 -33
- alita_sdk/tools/gitlab_org/__init__.py +7 -8
- alita_sdk/tools/google/bigquery/__init__.py +11 -12
- alita_sdk/tools/google_places/__init__.py +8 -7
- alita_sdk/tools/jira/__init__.py +9 -7
- alita_sdk/tools/jira/api_wrapper.py +100 -52
- alita_sdk/tools/keycloak/__init__.py +8 -7
- alita_sdk/tools/localgit/local_git.py +56 -54
- alita_sdk/tools/memory/__init__.py +1 -1
- alita_sdk/tools/non_code_indexer_toolkit.py +3 -2
- alita_sdk/tools/ocr/__init__.py +8 -7
- alita_sdk/tools/openapi/__init__.py +10 -1
- alita_sdk/tools/pandas/__init__.py +8 -7
- alita_sdk/tools/postman/__init__.py +7 -8
- alita_sdk/tools/postman/api_wrapper.py +19 -8
- alita_sdk/tools/postman/postman_analysis.py +8 -1
- alita_sdk/tools/pptx/__init__.py +8 -9
- alita_sdk/tools/qtest/__init__.py +16 -11
- alita_sdk/tools/qtest/api_wrapper.py +1784 -88
- alita_sdk/tools/rally/__init__.py +7 -8
- alita_sdk/tools/report_portal/__init__.py +9 -7
- alita_sdk/tools/salesforce/__init__.py +7 -7
- alita_sdk/tools/servicenow/__init__.py +10 -10
- alita_sdk/tools/sharepoint/__init__.py +7 -6
- alita_sdk/tools/sharepoint/api_wrapper.py +127 -36
- alita_sdk/tools/sharepoint/authorization_helper.py +191 -1
- alita_sdk/tools/sharepoint/utils.py +8 -2
- alita_sdk/tools/slack/__init__.py +7 -6
- alita_sdk/tools/sql/__init__.py +8 -7
- alita_sdk/tools/sql/api_wrapper.py +71 -23
- alita_sdk/tools/testio/__init__.py +7 -6
- alita_sdk/tools/testrail/__init__.py +8 -9
- alita_sdk/tools/utils/__init__.py +26 -4
- alita_sdk/tools/utils/content_parser.py +88 -60
- alita_sdk/tools/utils/text_operations.py +254 -0
- alita_sdk/tools/vector_adapters/VectorStoreAdapter.py +76 -26
- alita_sdk/tools/xray/__init__.py +9 -7
- alita_sdk/tools/zephyr/__init__.py +7 -6
- alita_sdk/tools/zephyr_enterprise/__init__.py +8 -6
- alita_sdk/tools/zephyr_essential/__init__.py +7 -6
- alita_sdk/tools/zephyr_essential/api_wrapper.py +12 -13
- alita_sdk/tools/zephyr_scale/__init__.py +7 -6
- alita_sdk/tools/zephyr_squad/__init__.py +7 -6
- {alita_sdk-0.3.351.dist-info → alita_sdk-0.3.499.dist-info}/METADATA +147 -2
- {alita_sdk-0.3.351.dist-info → alita_sdk-0.3.499.dist-info}/RECORD +206 -130
- alita_sdk-0.3.499.dist-info/entry_points.txt +2 -0
- {alita_sdk-0.3.351.dist-info → alita_sdk-0.3.499.dist-info}/WHEEL +0 -0
- {alita_sdk-0.3.351.dist-info → alita_sdk-0.3.499.dist-info}/licenses/LICENSE +0 -0
- {alita_sdk-0.3.351.dist-info → alita_sdk-0.3.499.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,389 @@
|
|
|
1
|
+
"""
|
|
2
|
+
HTML document parser for extracting links, scripts, and references.
|
|
3
|
+
|
|
4
|
+
Extracts links, script imports, stylesheets, and other references from HTML documents.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
import re
|
|
8
|
+
from typing import List, Optional, Set
|
|
9
|
+
from pathlib import Path
|
|
10
|
+
|
|
11
|
+
from .base import (
|
|
12
|
+
BaseParser, Symbol, Relationship, ParseResult,
|
|
13
|
+
RelationshipType, Range
|
|
14
|
+
)
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
class HTMLParser(BaseParser):
    """
    Regex-based parser for HTML documents and HTML-like templates.

    Symbols produced:
    - The document title (<title>)
    - Elements carrying an ``id`` attribute (potential anchor targets)

    Relationships produced:
    - Anchor links (<a href>)
    - Script imports (<script src>)
    - Stylesheet links (<link href> ending in .css)
    - Image sources (<img src>)
    - Form actions (<form action>)
    - Media and iframe sources (<video>/<audio>/<source>/<iframe> src)
    - Object/embed data (<object data>, <embed src>)
    - Inline-style background images (background: url(...))
    - Embedded data attributes (data-src / data-href / data-url)
    - Meta refresh targets (<meta content="...; url=...">)

    Matching is purely textual: no DOM is built, and (except for
    ``background: url(...)``) only quoted attribute values are recognised.
    """

    language = "html"
    file_extensions = ['.html', '.htm', '.xhtml', '.vue', '.svelte']

    # Link prefixes that do not point at another document or resource.
    _NON_NAVIGABLE_PREFIXES = ('javascript:', '#', 'mailto:', 'tel:')

    # Compiled patterns, keyed by the kind of reference they capture.
    #
    # The ``(?<![\w-])`` lookbehind in front of an attribute name makes sure
    # the name is matched as a whole attribute: without it ``[^>]*`` can
    # swallow a prefix, so ``data-href``, ``data-src`` or ``data-testid``
    # would be read as ``href`` / ``src`` / ``id``. Namespaced or bound forms
    # such as ``xlink:href`` and ``:src`` are still accepted.
    PATTERNS = {
        # Anchor links
        'anchor': re.compile(r'<a\s+[^>]*(?<![\w-])href=["\']([^"\']+)["\']', re.IGNORECASE),

        # Script sources
        'script': re.compile(r'<script\s+[^>]*(?<![\w-])src=["\']([^"\']+)["\']', re.IGNORECASE),

        # Stylesheet links (only hrefs ending in .css, optionally with a query string)
        'stylesheet': re.compile(
            r'<link\s+[^>]*(?<![\w-])href=["\']([^"\']+\.css(?:\?[^"\']*)?)["\']', re.IGNORECASE
        ),

        # Image sources
        'image': re.compile(r'<img\s+[^>]*(?<![\w-])src=["\']([^"\']+)["\']', re.IGNORECASE),

        # Form actions
        'form_action': re.compile(r'<form\s+[^>]*(?<![\w-])action=["\']([^"\']+)["\']', re.IGNORECASE),

        # iframe sources
        'iframe': re.compile(r'<iframe\s+[^>]*(?<![\w-])src=["\']([^"\']+)["\']', re.IGNORECASE),

        # Video/audio sources
        'media': re.compile(
            r'<(?:video|audio|source)\s+[^>]*(?<![\w-])src=["\']([^"\']+)["\']', re.IGNORECASE
        ),

        # Object/embed data
        'embed': re.compile(
            r'<(?:object|embed)\s+[^>]*(?<![\w-])(?:data|src)=["\']([^"\']+)["\']', re.IGNORECASE
        ),

        # Meta refresh/canonical
        'meta_url': re.compile(r'<meta\s+[^>]*(?:content|href)=["\'][^"\']*url=([^"\';\s]+)', re.IGNORECASE),

        # Background images in style
        'bg_image': re.compile(r'background(?:-image)?:\s*url\(["\']?([^"\')\s]+)["\']?\)', re.IGNORECASE),

        # Data attributes that might contain URLs
        'data_url': re.compile(r'data-(?:src|href|url)=["\']([^"\']+)["\']', re.IGNORECASE),

        # Title tag (for document identification)
        'title': re.compile(r'<title>([^<]+)</title>', re.IGNORECASE),

        # ID attributes (for potential anchor targets)
        'id_attr': re.compile(r'<(\w+)\s+[^>]*(?<![\w-])id=["\']([^"\']+)["\']', re.IGNORECASE),

        # Comments that might contain references.
        # NOTE(review): no extractor in this class consumes this pattern yet.
        'html_comment': re.compile(r'<!--\s*(?:TODO|FIXME|NOTE|SEE|REF):\s*([^-]+)-->', re.IGNORECASE),
    }

    def __init__(self):
        """Initialize the HTML parser."""
        super().__init__(language=self.language)

    def _get_supported_extensions(self) -> Set[str]:
        """Return supported file extensions (derived from ``file_extensions``)."""
        return set(self.file_extensions)

    def _make_range(self, start_line: int, end_line: Optional[int] = None) -> Range:
        """
        Create a Range object covering whole lines.

        Args:
            start_line: First line of the range.
            end_line: Last line of the range; defaults to ``start_line``.

        Returns:
            A Range with both columns set to 0.
        """
        return Range(
            start_line=start_line,
            end_line=start_line if end_line is None else end_line,
            start_col=0,
            end_col=0
        )

    def _make_symbol(
        self,
        name: str,
        symbol_type: str,
        line: int,
        file_path: str,
        scope: str = "document",
        **kwargs
    ) -> Symbol:
        """
        Create a Symbol located on a single line.

        Args:
            name: Symbol name.
            symbol_type: Kind of symbol (e.g. "document_title", "anchor_target").
            line: Line the symbol was found on.
            file_path: File the symbol belongs to.
            scope: Symbol scope; defaults to "document".
            **kwargs: Extra fields forwarded unchanged to ``Symbol``.
        """
        return Symbol(
            name=name,
            symbol_type=symbol_type,
            scope=scope,
            range=self._make_range(line),
            file_path=file_path,
            **kwargs
        )

    def _make_relationship(
        self,
        source: str,
        target: str,
        rel_type: RelationshipType,
        file_path: str,
        line: int,
        confidence: float = 0.90
    ) -> Relationship:
        """
        Create a Relationship originating from a single line of ``file_path``.

        Args:
            source: Name of the referencing symbol.
            target: Name (normalized URL) of the referenced resource.
            rel_type: Kind of relationship.
            file_path: File in which the reference appears.
            line: Line on which the reference appears.
            confidence: Confidence score attached to the relationship.
        """
        return Relationship(
            source_symbol=source,
            target_symbol=target,
            relationship_type=rel_type,
            source_file=file_path,
            source_range=self._make_range(line),
            confidence=confidence
        )

    def _get_line_number(self, content: str, match_start: int) -> int:
        """Return the 1-based line number of character offset ``match_start``."""
        # Bounded count avoids copying the prefix of ``content`` for every match.
        return content.count('\n', 0, match_start) + 1

    def parse_file(self, file_path: str, content: Optional[str] = None) -> ParseResult:
        """
        Parse an HTML file for links and references.

        Args:
            file_path: Path to the file
            content: Optional file content (read from file if not provided)

        Returns:
            ParseResult with symbols (title, ids) and relationships (links, imports).
            If the file cannot be read, the result has no symbols or
            relationships and a single error message.
        """
        if content is None:
            try:
                with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
                    content = f.read()
            except Exception:
                # Deliberately best-effort: an unreadable file is reported, not raised.
                return ParseResult(symbols=[], relationships=[], errors=[f"Could not read {file_path}"])

        symbols: List[Symbol] = []
        relationships: List[Relationship] = []

        # Document name for source references
        doc_name = Path(file_path).stem

        # Symbols: title and id-bearing elements
        self._extract_title(content, file_path, symbols)
        self._extract_ids(content, file_path, symbols)

        # Relationships: one extractor per reference kind
        self._extract_anchors(content, file_path, doc_name, relationships)
        self._extract_scripts(content, file_path, doc_name, relationships)
        self._extract_stylesheets(content, file_path, doc_name, relationships)
        self._extract_images(content, file_path, doc_name, relationships)
        self._extract_forms(content, file_path, doc_name, relationships)
        self._extract_media(content, file_path, doc_name, relationships)
        self._extract_embeds(content, file_path, doc_name, relationships)
        self._extract_background_images(content, file_path, doc_name, relationships)
        self._extract_data_urls(content, file_path, doc_name, relationships)
        # Run last so the order of the other relationships is unaffected.
        self._extract_meta_urls(content, file_path, doc_name, relationships)

        return ParseResult(
            symbols=symbols,
            relationships=relationships,
            errors=[]
        )

    def _extract_title(self, content: str, file_path: str, symbols: List[Symbol]):
        """Append a "document_title" symbol for the first <title> element, if any."""
        match = self.PATTERNS['title'].search(content)
        if not match:
            return

        title = match.group(1).strip()
        if not title:
            # A whitespace-only title carries no identifying information.
            return

        symbols.append(self._make_symbol(
            name=title,
            symbol_type="document_title",
            line=self._get_line_number(content, match.start()),
            file_path=file_path
        ))

    def _extract_ids(self, content: str, file_path: str, symbols: List[Symbol]):
        """Append an "anchor_target" symbol named ``#<id>`` for each element with an id."""
        for match in self.PATTERNS['id_attr'].finditer(content):
            tag, id_value = match.group(1), match.group(2)
            symbols.append(self._make_symbol(
                name=f"#{id_value}",
                symbol_type="anchor_target",
                line=self._get_line_number(content, match.start()),
                file_path=file_path,
                metadata={'tag': tag}
            ))

    def _collect_links(
        self,
        content: str,
        pattern_key: str,
        rel_type: RelationshipType,
        confidence: float,
        file_path: str,
        doc_name: str,
        relationships: List[Relationship],
        skip_prefixes: tuple = ()
    ) -> None:
        """
        Append one relationship per match of ``PATTERNS[pattern_key]``.

        Args:
            content: Document text to scan.
            pattern_key: Key into ``PATTERNS``; group 1 must capture the URL.
            rel_type: Relationship type assigned to every match.
            confidence: Confidence assigned to every match.
            file_path: File the document was read from.
            doc_name: Source symbol name for the relationships.
            relationships: List that results are appended to (mutated in place).
            skip_prefixes: Lower-case prefixes; URLs starting with any of them
                are ignored. Compared case-insensitively.
        """
        for match in self.PATTERNS[pattern_key].finditer(content):
            url = match.group(1).strip()
            # Skip blank values and excluded schemes (schemes are case-insensitive).
            if not url or url.lower().startswith(skip_prefixes):
                continue

            relationships.append(self._make_relationship(
                source=doc_name,
                target=self._normalize_url(url),
                rel_type=rel_type,
                file_path=file_path,
                line=self._get_line_number(content, match.start()),
                confidence=confidence
            ))

    def _extract_anchors(self, content: str, file_path: str, doc_name: str, relationships: List[Relationship]):
        """Extract anchor links, skipping javascript:, fragment-only, mailto: and tel: hrefs."""
        self._collect_links(
            content, 'anchor', RelationshipType.REFERENCES, 0.90,
            file_path, doc_name, relationships,
            skip_prefixes=self._NON_NAVIGABLE_PREFIXES
        )

    def _extract_scripts(self, content: str, file_path: str, doc_name: str, relationships: List[Relationship]):
        """Extract script imports (<script src>) as IMPORTS relationships."""
        self._collect_links(
            content, 'script', RelationshipType.IMPORTS, 0.95,
            file_path, doc_name, relationships
        )

    def _extract_stylesheets(self, content: str, file_path: str, doc_name: str, relationships: List[Relationship]):
        """Extract stylesheet links (<link href="*.css">) as IMPORTS relationships."""
        self._collect_links(
            content, 'stylesheet', RelationshipType.IMPORTS, 0.95,
            file_path, doc_name, relationships
        )

    def _extract_images(self, content: str, file_path: str, doc_name: str, relationships: List[Relationship]):
        """Extract image sources, skipping inline data: URIs."""
        self._collect_links(
            content, 'image', RelationshipType.REFERENCES, 0.85,
            file_path, doc_name, relationships,
            skip_prefixes=('data:',)
        )

    def _extract_forms(self, content: str, file_path: str, doc_name: str, relationships: List[Relationship]):
        """Extract form actions, skipping fragment-only actions."""
        self._collect_links(
            content, 'form_action', RelationshipType.REFERENCES, 0.80,
            file_path, doc_name, relationships,
            skip_prefixes=('#',)
        )

    def _extract_media(self, content: str, file_path: str, doc_name: str, relationships: List[Relationship]):
        """Extract video/audio/source sources, followed by iframe sources."""
        self._collect_links(
            content, 'media', RelationshipType.REFERENCES, 0.85,
            file_path, doc_name, relationships
        )
        # iframes are handled here as well, with a slightly lower confidence.
        self._collect_links(
            content, 'iframe', RelationshipType.REFERENCES, 0.80,
            file_path, doc_name, relationships
        )

    def _extract_embeds(self, content: str, file_path: str, doc_name: str, relationships: List[Relationship]):
        """Extract object/embed sources."""
        self._collect_links(
            content, 'embed', RelationshipType.REFERENCES, 0.80,
            file_path, doc_name, relationships
        )

    def _extract_background_images(self, content: str, file_path: str, doc_name: str, relationships: List[Relationship]):
        """Extract background images from inline styles, skipping data: URIs."""
        self._collect_links(
            content, 'bg_image', RelationshipType.REFERENCES, 0.75,
            file_path, doc_name, relationships,
            skip_prefixes=('data:',)
        )

    def _extract_data_urls(self, content: str, file_path: str, doc_name: str, relationships: List[Relationship]):
        """Extract URLs from data-src / data-href / data-url attributes, skipping data: URIs."""
        self._collect_links(
            content, 'data_url', RelationshipType.REFERENCES, 0.70,
            file_path, doc_name, relationships,
            skip_prefixes=('data:',)
        )

    def _extract_meta_urls(self, content: str, file_path: str, doc_name: str, relationships: List[Relationship]):
        """Extract ``url=...`` targets from <meta> content/href attributes (e.g. meta refresh)."""
        # Lower confidence: the pattern matches any "url=" inside the attribute
        # value, which can also occur in unrelated meta content.
        self._collect_links(
            content, 'meta_url', RelationshipType.REFERENCES, 0.70,
            file_path, doc_name, relationships
        )

    def _normalize_url(self, url: str) -> str:
        """
        Normalize a URL so equal references compare equal.

        Absolute (http://, https://) and protocol-relative (//) URLs are
        returned as-is apart from surrounding whitespace. For everything
        else a leading "./" and any query string are removed.
        """
        url = url.strip()

        # Keep full URLs; the scheme is matched case-insensitively so that
        # e.g. "HTTP://host/?q=1" is not mistaken for a local path.
        if url.lower().startswith(('http://', 'https://', '//')):
            return url

        # Clean relative paths
        if url.startswith('./'):
            url = url[2:]

        # Remove query strings for local files
        return url.split('?', 1)[0]
|