alita-sdk 0.3.257__py3-none-any.whl → 0.3.562__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (278) hide show
  1. alita_sdk/cli/__init__.py +10 -0
  2. alita_sdk/cli/__main__.py +17 -0
  3. alita_sdk/cli/agent/__init__.py +5 -0
  4. alita_sdk/cli/agent/default.py +258 -0
  5. alita_sdk/cli/agent_executor.py +155 -0
  6. alita_sdk/cli/agent_loader.py +215 -0
  7. alita_sdk/cli/agent_ui.py +228 -0
  8. alita_sdk/cli/agents.py +3601 -0
  9. alita_sdk/cli/callbacks.py +647 -0
  10. alita_sdk/cli/cli.py +168 -0
  11. alita_sdk/cli/config.py +306 -0
  12. alita_sdk/cli/context/__init__.py +30 -0
  13. alita_sdk/cli/context/cleanup.py +198 -0
  14. alita_sdk/cli/context/manager.py +731 -0
  15. alita_sdk/cli/context/message.py +285 -0
  16. alita_sdk/cli/context/strategies.py +289 -0
  17. alita_sdk/cli/context/token_estimation.py +127 -0
  18. alita_sdk/cli/formatting.py +182 -0
  19. alita_sdk/cli/input_handler.py +419 -0
  20. alita_sdk/cli/inventory.py +1073 -0
  21. alita_sdk/cli/mcp_loader.py +315 -0
  22. alita_sdk/cli/toolkit.py +327 -0
  23. alita_sdk/cli/toolkit_loader.py +85 -0
  24. alita_sdk/cli/tools/__init__.py +43 -0
  25. alita_sdk/cli/tools/approval.py +224 -0
  26. alita_sdk/cli/tools/filesystem.py +1751 -0
  27. alita_sdk/cli/tools/planning.py +389 -0
  28. alita_sdk/cli/tools/terminal.py +414 -0
  29. alita_sdk/community/__init__.py +72 -12
  30. alita_sdk/community/inventory/__init__.py +236 -0
  31. alita_sdk/community/inventory/config.py +257 -0
  32. alita_sdk/community/inventory/enrichment.py +2137 -0
  33. alita_sdk/community/inventory/extractors.py +1469 -0
  34. alita_sdk/community/inventory/ingestion.py +3172 -0
  35. alita_sdk/community/inventory/knowledge_graph.py +1457 -0
  36. alita_sdk/community/inventory/parsers/__init__.py +218 -0
  37. alita_sdk/community/inventory/parsers/base.py +295 -0
  38. alita_sdk/community/inventory/parsers/csharp_parser.py +907 -0
  39. alita_sdk/community/inventory/parsers/go_parser.py +851 -0
  40. alita_sdk/community/inventory/parsers/html_parser.py +389 -0
  41. alita_sdk/community/inventory/parsers/java_parser.py +593 -0
  42. alita_sdk/community/inventory/parsers/javascript_parser.py +629 -0
  43. alita_sdk/community/inventory/parsers/kotlin_parser.py +768 -0
  44. alita_sdk/community/inventory/parsers/markdown_parser.py +362 -0
  45. alita_sdk/community/inventory/parsers/python_parser.py +604 -0
  46. alita_sdk/community/inventory/parsers/rust_parser.py +858 -0
  47. alita_sdk/community/inventory/parsers/swift_parser.py +832 -0
  48. alita_sdk/community/inventory/parsers/text_parser.py +322 -0
  49. alita_sdk/community/inventory/parsers/yaml_parser.py +370 -0
  50. alita_sdk/community/inventory/patterns/__init__.py +61 -0
  51. alita_sdk/community/inventory/patterns/ast_adapter.py +380 -0
  52. alita_sdk/community/inventory/patterns/loader.py +348 -0
  53. alita_sdk/community/inventory/patterns/registry.py +198 -0
  54. alita_sdk/community/inventory/presets.py +535 -0
  55. alita_sdk/community/inventory/retrieval.py +1403 -0
  56. alita_sdk/community/inventory/toolkit.py +173 -0
  57. alita_sdk/community/inventory/toolkit_utils.py +176 -0
  58. alita_sdk/community/inventory/visualize.py +1370 -0
  59. alita_sdk/configurations/__init__.py +11 -0
  60. alita_sdk/configurations/ado.py +148 -2
  61. alita_sdk/configurations/azure_search.py +1 -1
  62. alita_sdk/configurations/bigquery.py +1 -1
  63. alita_sdk/configurations/bitbucket.py +94 -2
  64. alita_sdk/configurations/browser.py +18 -0
  65. alita_sdk/configurations/carrier.py +19 -0
  66. alita_sdk/configurations/confluence.py +130 -1
  67. alita_sdk/configurations/delta_lake.py +1 -1
  68. alita_sdk/configurations/figma.py +76 -5
  69. alita_sdk/configurations/github.py +65 -1
  70. alita_sdk/configurations/gitlab.py +81 -0
  71. alita_sdk/configurations/google_places.py +17 -0
  72. alita_sdk/configurations/jira.py +103 -0
  73. alita_sdk/configurations/openapi.py +111 -0
  74. alita_sdk/configurations/postman.py +1 -1
  75. alita_sdk/configurations/qtest.py +72 -3
  76. alita_sdk/configurations/report_portal.py +115 -0
  77. alita_sdk/configurations/salesforce.py +19 -0
  78. alita_sdk/configurations/service_now.py +1 -12
  79. alita_sdk/configurations/sharepoint.py +167 -0
  80. alita_sdk/configurations/sonar.py +18 -0
  81. alita_sdk/configurations/sql.py +20 -0
  82. alita_sdk/configurations/testio.py +101 -0
  83. alita_sdk/configurations/testrail.py +88 -0
  84. alita_sdk/configurations/xray.py +94 -1
  85. alita_sdk/configurations/zephyr_enterprise.py +94 -1
  86. alita_sdk/configurations/zephyr_essential.py +95 -0
  87. alita_sdk/runtime/clients/artifact.py +21 -4
  88. alita_sdk/runtime/clients/client.py +458 -67
  89. alita_sdk/runtime/clients/mcp_discovery.py +342 -0
  90. alita_sdk/runtime/clients/mcp_manager.py +262 -0
  91. alita_sdk/runtime/clients/sandbox_client.py +352 -0
  92. alita_sdk/runtime/langchain/_constants_bkup.py +1318 -0
  93. alita_sdk/runtime/langchain/assistant.py +183 -43
  94. alita_sdk/runtime/langchain/constants.py +647 -1
  95. alita_sdk/runtime/langchain/document_loaders/AlitaDocxMammothLoader.py +315 -3
  96. alita_sdk/runtime/langchain/document_loaders/AlitaExcelLoader.py +209 -31
  97. alita_sdk/runtime/langchain/document_loaders/AlitaImageLoader.py +1 -1
  98. alita_sdk/runtime/langchain/document_loaders/AlitaJSONLinesLoader.py +77 -0
  99. alita_sdk/runtime/langchain/document_loaders/AlitaJSONLoader.py +10 -3
  100. alita_sdk/runtime/langchain/document_loaders/AlitaMarkdownLoader.py +66 -0
  101. alita_sdk/runtime/langchain/document_loaders/AlitaPDFLoader.py +79 -10
  102. alita_sdk/runtime/langchain/document_loaders/AlitaPowerPointLoader.py +52 -15
  103. alita_sdk/runtime/langchain/document_loaders/AlitaPythonLoader.py +9 -0
  104. alita_sdk/runtime/langchain/document_loaders/AlitaTableLoader.py +1 -4
  105. alita_sdk/runtime/langchain/document_loaders/AlitaTextLoader.py +15 -2
  106. alita_sdk/runtime/langchain/document_loaders/ImageParser.py +30 -0
  107. alita_sdk/runtime/langchain/document_loaders/constants.py +189 -41
  108. alita_sdk/runtime/langchain/interfaces/llm_processor.py +4 -2
  109. alita_sdk/runtime/langchain/langraph_agent.py +407 -92
  110. alita_sdk/runtime/langchain/utils.py +102 -8
  111. alita_sdk/runtime/llms/preloaded.py +2 -6
  112. alita_sdk/runtime/models/mcp_models.py +61 -0
  113. alita_sdk/runtime/skills/__init__.py +91 -0
  114. alita_sdk/runtime/skills/callbacks.py +498 -0
  115. alita_sdk/runtime/skills/discovery.py +540 -0
  116. alita_sdk/runtime/skills/executor.py +610 -0
  117. alita_sdk/runtime/skills/input_builder.py +371 -0
  118. alita_sdk/runtime/skills/models.py +330 -0
  119. alita_sdk/runtime/skills/registry.py +355 -0
  120. alita_sdk/runtime/skills/skill_runner.py +330 -0
  121. alita_sdk/runtime/toolkits/__init__.py +28 -0
  122. alita_sdk/runtime/toolkits/application.py +14 -4
  123. alita_sdk/runtime/toolkits/artifact.py +24 -9
  124. alita_sdk/runtime/toolkits/datasource.py +13 -6
  125. alita_sdk/runtime/toolkits/mcp.py +780 -0
  126. alita_sdk/runtime/toolkits/planning.py +178 -0
  127. alita_sdk/runtime/toolkits/skill_router.py +238 -0
  128. alita_sdk/runtime/toolkits/subgraph.py +11 -6
  129. alita_sdk/runtime/toolkits/tools.py +314 -70
  130. alita_sdk/runtime/toolkits/vectorstore.py +11 -5
  131. alita_sdk/runtime/tools/__init__.py +24 -0
  132. alita_sdk/runtime/tools/application.py +16 -4
  133. alita_sdk/runtime/tools/artifact.py +367 -33
  134. alita_sdk/runtime/tools/data_analysis.py +183 -0
  135. alita_sdk/runtime/tools/function.py +100 -4
  136. alita_sdk/runtime/tools/graph.py +81 -0
  137. alita_sdk/runtime/tools/image_generation.py +218 -0
  138. alita_sdk/runtime/tools/llm.py +1013 -177
  139. alita_sdk/runtime/tools/loop.py +3 -1
  140. alita_sdk/runtime/tools/loop_output.py +3 -1
  141. alita_sdk/runtime/tools/mcp_inspect_tool.py +284 -0
  142. alita_sdk/runtime/tools/mcp_remote_tool.py +181 -0
  143. alita_sdk/runtime/tools/mcp_server_tool.py +3 -1
  144. alita_sdk/runtime/tools/planning/__init__.py +36 -0
  145. alita_sdk/runtime/tools/planning/models.py +246 -0
  146. alita_sdk/runtime/tools/planning/wrapper.py +607 -0
  147. alita_sdk/runtime/tools/router.py +2 -1
  148. alita_sdk/runtime/tools/sandbox.py +375 -0
  149. alita_sdk/runtime/tools/skill_router.py +776 -0
  150. alita_sdk/runtime/tools/tool.py +3 -1
  151. alita_sdk/runtime/tools/vectorstore.py +69 -65
  152. alita_sdk/runtime/tools/vectorstore_base.py +163 -90
  153. alita_sdk/runtime/utils/AlitaCallback.py +137 -21
  154. alita_sdk/runtime/utils/mcp_client.py +492 -0
  155. alita_sdk/runtime/utils/mcp_oauth.py +361 -0
  156. alita_sdk/runtime/utils/mcp_sse_client.py +434 -0
  157. alita_sdk/runtime/utils/mcp_tools_discovery.py +124 -0
  158. alita_sdk/runtime/utils/streamlit.py +41 -14
  159. alita_sdk/runtime/utils/toolkit_utils.py +28 -9
  160. alita_sdk/runtime/utils/utils.py +48 -0
  161. alita_sdk/tools/__init__.py +135 -37
  162. alita_sdk/tools/ado/__init__.py +2 -2
  163. alita_sdk/tools/ado/repos/__init__.py +15 -19
  164. alita_sdk/tools/ado/repos/repos_wrapper.py +12 -20
  165. alita_sdk/tools/ado/test_plan/__init__.py +26 -8
  166. alita_sdk/tools/ado/test_plan/test_plan_wrapper.py +56 -28
  167. alita_sdk/tools/ado/wiki/__init__.py +27 -12
  168. alita_sdk/tools/ado/wiki/ado_wrapper.py +114 -40
  169. alita_sdk/tools/ado/work_item/__init__.py +27 -12
  170. alita_sdk/tools/ado/work_item/ado_wrapper.py +95 -11
  171. alita_sdk/tools/advanced_jira_mining/__init__.py +12 -8
  172. alita_sdk/tools/aws/delta_lake/__init__.py +14 -11
  173. alita_sdk/tools/aws/delta_lake/tool.py +5 -1
  174. alita_sdk/tools/azure_ai/search/__init__.py +13 -8
  175. alita_sdk/tools/base/tool.py +5 -1
  176. alita_sdk/tools/base_indexer_toolkit.py +454 -110
  177. alita_sdk/tools/bitbucket/__init__.py +27 -19
  178. alita_sdk/tools/bitbucket/api_wrapper.py +285 -27
  179. alita_sdk/tools/bitbucket/cloud_api_wrapper.py +5 -5
  180. alita_sdk/tools/browser/__init__.py +41 -16
  181. alita_sdk/tools/browser/crawler.py +3 -1
  182. alita_sdk/tools/browser/utils.py +15 -6
  183. alita_sdk/tools/carrier/__init__.py +18 -17
  184. alita_sdk/tools/carrier/backend_reports_tool.py +8 -4
  185. alita_sdk/tools/carrier/excel_reporter.py +8 -4
  186. alita_sdk/tools/chunkers/__init__.py +3 -1
  187. alita_sdk/tools/chunkers/code/codeparser.py +1 -1
  188. alita_sdk/tools/chunkers/sematic/json_chunker.py +2 -1
  189. alita_sdk/tools/chunkers/sematic/markdown_chunker.py +97 -6
  190. alita_sdk/tools/chunkers/sematic/proposal_chunker.py +1 -1
  191. alita_sdk/tools/chunkers/universal_chunker.py +270 -0
  192. alita_sdk/tools/cloud/aws/__init__.py +11 -7
  193. alita_sdk/tools/cloud/azure/__init__.py +11 -7
  194. alita_sdk/tools/cloud/gcp/__init__.py +11 -7
  195. alita_sdk/tools/cloud/k8s/__init__.py +11 -7
  196. alita_sdk/tools/code/linter/__init__.py +9 -8
  197. alita_sdk/tools/code/loaders/codesearcher.py +3 -2
  198. alita_sdk/tools/code/sonar/__init__.py +20 -13
  199. alita_sdk/tools/code_indexer_toolkit.py +199 -0
  200. alita_sdk/tools/confluence/__init__.py +21 -14
  201. alita_sdk/tools/confluence/api_wrapper.py +197 -58
  202. alita_sdk/tools/confluence/loader.py +14 -2
  203. alita_sdk/tools/custom_open_api/__init__.py +11 -5
  204. alita_sdk/tools/elastic/__init__.py +10 -8
  205. alita_sdk/tools/elitea_base.py +546 -64
  206. alita_sdk/tools/figma/__init__.py +11 -8
  207. alita_sdk/tools/figma/api_wrapper.py +352 -153
  208. alita_sdk/tools/github/__init__.py +17 -17
  209. alita_sdk/tools/github/api_wrapper.py +9 -26
  210. alita_sdk/tools/github/github_client.py +81 -12
  211. alita_sdk/tools/github/schemas.py +2 -1
  212. alita_sdk/tools/github/tool.py +5 -1
  213. alita_sdk/tools/gitlab/__init__.py +18 -13
  214. alita_sdk/tools/gitlab/api_wrapper.py +224 -80
  215. alita_sdk/tools/gitlab_org/__init__.py +13 -10
  216. alita_sdk/tools/google/bigquery/__init__.py +13 -13
  217. alita_sdk/tools/google/bigquery/tool.py +5 -1
  218. alita_sdk/tools/google_places/__init__.py +20 -11
  219. alita_sdk/tools/jira/__init__.py +21 -11
  220. alita_sdk/tools/jira/api_wrapper.py +315 -168
  221. alita_sdk/tools/keycloak/__init__.py +10 -8
  222. alita_sdk/tools/localgit/__init__.py +8 -3
  223. alita_sdk/tools/localgit/local_git.py +62 -54
  224. alita_sdk/tools/localgit/tool.py +5 -1
  225. alita_sdk/tools/memory/__init__.py +38 -14
  226. alita_sdk/tools/non_code_indexer_toolkit.py +7 -2
  227. alita_sdk/tools/ocr/__init__.py +10 -8
  228. alita_sdk/tools/openapi/__init__.py +281 -108
  229. alita_sdk/tools/openapi/api_wrapper.py +883 -0
  230. alita_sdk/tools/openapi/tool.py +20 -0
  231. alita_sdk/tools/pandas/__init__.py +18 -11
  232. alita_sdk/tools/pandas/api_wrapper.py +40 -45
  233. alita_sdk/tools/pandas/dataframe/generator/base.py +3 -1
  234. alita_sdk/tools/postman/__init__.py +10 -11
  235. alita_sdk/tools/postman/api_wrapper.py +19 -8
  236. alita_sdk/tools/postman/postman_analysis.py +8 -1
  237. alita_sdk/tools/pptx/__init__.py +10 -10
  238. alita_sdk/tools/qtest/__init__.py +21 -14
  239. alita_sdk/tools/qtest/api_wrapper.py +1784 -88
  240. alita_sdk/tools/rally/__init__.py +12 -10
  241. alita_sdk/tools/report_portal/__init__.py +22 -16
  242. alita_sdk/tools/salesforce/__init__.py +21 -16
  243. alita_sdk/tools/servicenow/__init__.py +20 -16
  244. alita_sdk/tools/servicenow/api_wrapper.py +1 -1
  245. alita_sdk/tools/sharepoint/__init__.py +16 -14
  246. alita_sdk/tools/sharepoint/api_wrapper.py +179 -39
  247. alita_sdk/tools/sharepoint/authorization_helper.py +191 -1
  248. alita_sdk/tools/sharepoint/utils.py +8 -2
  249. alita_sdk/tools/slack/__init__.py +11 -7
  250. alita_sdk/tools/sql/__init__.py +21 -19
  251. alita_sdk/tools/sql/api_wrapper.py +71 -23
  252. alita_sdk/tools/testio/__init__.py +20 -13
  253. alita_sdk/tools/testrail/__init__.py +12 -11
  254. alita_sdk/tools/testrail/api_wrapper.py +214 -46
  255. alita_sdk/tools/utils/__init__.py +28 -4
  256. alita_sdk/tools/utils/content_parser.py +182 -62
  257. alita_sdk/tools/utils/text_operations.py +254 -0
  258. alita_sdk/tools/vector_adapters/VectorStoreAdapter.py +83 -27
  259. alita_sdk/tools/xray/__init__.py +17 -14
  260. alita_sdk/tools/xray/api_wrapper.py +58 -113
  261. alita_sdk/tools/yagmail/__init__.py +8 -3
  262. alita_sdk/tools/zephyr/__init__.py +11 -7
  263. alita_sdk/tools/zephyr_enterprise/__init__.py +15 -9
  264. alita_sdk/tools/zephyr_enterprise/api_wrapper.py +30 -15
  265. alita_sdk/tools/zephyr_essential/__init__.py +15 -10
  266. alita_sdk/tools/zephyr_essential/api_wrapper.py +297 -54
  267. alita_sdk/tools/zephyr_essential/client.py +6 -4
  268. alita_sdk/tools/zephyr_scale/__init__.py +12 -8
  269. alita_sdk/tools/zephyr_scale/api_wrapper.py +39 -31
  270. alita_sdk/tools/zephyr_squad/__init__.py +11 -7
  271. {alita_sdk-0.3.257.dist-info → alita_sdk-0.3.562.dist-info}/METADATA +184 -37
  272. alita_sdk-0.3.562.dist-info/RECORD +450 -0
  273. alita_sdk-0.3.562.dist-info/entry_points.txt +2 -0
  274. alita_sdk/tools/bitbucket/tools.py +0 -304
  275. alita_sdk-0.3.257.dist-info/RECORD +0 -343
  276. {alita_sdk-0.3.257.dist-info → alita_sdk-0.3.562.dist-info}/WHEEL +0 -0
  277. {alita_sdk-0.3.257.dist-info → alita_sdk-0.3.562.dist-info}/licenses/LICENSE +0 -0
  278. {alita_sdk-0.3.257.dist-info → alita_sdk-0.3.562.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,389 @@
1
+ """
2
+ HTML document parser for extracting links, scripts, and references.
3
+
4
+ Extracts links, script imports, stylesheets, and other references from HTML documents.
5
+ """
6
+
7
+ import re
8
+ from typing import List, Optional, Set
9
+ from pathlib import Path
10
+
11
+ from .base import (
12
+ BaseParser, Symbol, Relationship, ParseResult,
13
+ RelationshipType, Range
14
+ )
15
+
16
+
17
class HTMLParser(BaseParser):
    """
    Regex-based parser for HTML documents.

    The parser does not build a DOM; it scans the raw text with the compiled
    expressions in ``PATTERNS`` and reports what it finds.

    Symbols produced:
    - The document title (``<title>``), as ``document_title``
    - Elements carrying an ``id`` attribute, as ``anchor_target`` (``#id``)

    Relationships produced (source is the file stem):
    - Anchor links (<a href="">)
    - Script imports (<script src="">)
    - Stylesheet links (<link href="*.css">)
    - Image sources (<img src="">)
    - Form actions
    - Video/audio/source and iframe sources
    - Object/embed data
    - Inline-style background images
    - ``data-src`` / ``data-href`` / ``data-url`` attributes
    - Meta refresh URLs

    The ``html_comment`` pattern is compiled for callers that want it but is
    not consumed by ``parse_file``.
    """

    language = "html"
    file_extensions = ['.html', '.htm', '.xhtml', '.vue', '.svelte']

    # Regex fragment placed between a tag name and an attribute name. The
    # negative lookbehind rejects attribute names that merely END with the
    # wanted name (``data-src``, ``data-id``, ``grid``), which a plain
    # ``[^>]*src=`` would accept.
    _ATTR_LEAD = r'\s+[^>]*(?<![\w-])'

    # Regex fragment for ``= "value"`` / ``= 'value'``; whitespace around the
    # equals sign is legal HTML. Captures the non-empty value.
    _QUOTED = r'\s*=\s*["\']([^"\']+)["\']'

    # Patterns for HTML elements
    PATTERNS = {
        # Anchor links
        'anchor': re.compile(r'<a' + _ATTR_LEAD + r'href' + _QUOTED, re.IGNORECASE),

        # Script sources
        'script': re.compile(r'<script' + _ATTR_LEAD + r'src' + _QUOTED, re.IGNORECASE),

        # Stylesheet links (only hrefs ending in .css, optionally with a query)
        'stylesheet': re.compile(
            r'<link' + _ATTR_LEAD + r'href\s*=\s*["\']([^"\']+\.css(?:\?[^"\']*)?)["\']',
            re.IGNORECASE,
        ),

        # Image sources
        'image': re.compile(r'<img' + _ATTR_LEAD + r'src' + _QUOTED, re.IGNORECASE),

        # Form actions
        'form_action': re.compile(r'<form' + _ATTR_LEAD + r'action' + _QUOTED, re.IGNORECASE),

        # iframe sources
        'iframe': re.compile(r'<iframe' + _ATTR_LEAD + r'src' + _QUOTED, re.IGNORECASE),

        # Video/audio sources
        'media': re.compile(
            r'<(?:video|audio|source)' + _ATTR_LEAD + r'src' + _QUOTED,
            re.IGNORECASE,
        ),

        # Object/embed data
        'embed': re.compile(
            r'<(?:object|embed)' + _ATTR_LEAD + r'(?:data|src)' + _QUOTED,
            re.IGNORECASE,
        ),

        # Meta refresh/canonical
        'meta_url': re.compile(r'<meta\s+[^>]*(?:content|href)=["\'][^"\']*url=([^"\';\s]+)', re.IGNORECASE),

        # Background images in style
        'bg_image': re.compile(r'background(?:-image)?:\s*url\(["\']?([^"\')\s]+)["\']?\)', re.IGNORECASE),

        # Data attributes that might contain URLs
        'data_url': re.compile(r'(?<![\w-])data-(?:src|href|url)' + _QUOTED, re.IGNORECASE),

        # Title tag (for document identification); attributes on the tag are tolerated
        'title': re.compile(r'<title\b[^>]*>([^<]+)</title\s*>', re.IGNORECASE),

        # ID attributes (for potential anchor targets); group 1 = tag, group 2 = id
        'id_attr': re.compile(r'<([\w-]+)' + _ATTR_LEAD + r'id' + _QUOTED, re.IGNORECASE),

        # Comments that might contain references
        'html_comment': re.compile(r'<!--\s*(?:TODO|FIXME|NOTE|SEE|REF):\s*([^-]+)-->', re.IGNORECASE),
    }

    def __init__(self):
        """Initialize the HTML parser."""
        super().__init__(language=self.language)

    def _get_supported_extensions(self) -> Set[str]:
        """Return supported file extensions (derived from ``file_extensions``)."""
        return set(self.file_extensions)

    def _make_range(self, start_line: int, end_line: Optional[int] = None) -> Range:
        """
        Create a Range object covering whole lines.

        Args:
            start_line: First line of the range.
            end_line: Last line of the range; falls back to ``start_line``
                when omitted (or falsy).

        Returns:
            A Range with both columns set to 0.
        """
        return Range(
            start_line=start_line,
            end_line=end_line or start_line,
            start_col=0,
            end_col=0
        )

    def _make_symbol(
        self,
        name: str,
        symbol_type: str,
        line: int,
        file_path: str,
        scope: str = "document",
        **kwargs
    ) -> Symbol:
        """
        Create a Symbol located on a single line.

        Args:
            name: Symbol name.
            symbol_type: Symbol category (e.g. ``document_title``).
            line: Line the symbol was found on.
            file_path: File the symbol belongs to.
            scope: Symbol scope, ``"document"`` by default.
            **kwargs: Extra fields forwarded unchanged to ``Symbol``.
        """
        return Symbol(
            name=name,
            symbol_type=symbol_type,
            scope=scope,
            range=self._make_range(line),
            file_path=file_path,
            **kwargs
        )

    def _make_relationship(
        self,
        source: str,
        target: str,
        rel_type: RelationshipType,
        file_path: str,
        line: int,
        confidence: float = 0.90
    ) -> Relationship:
        """
        Create a Relationship anchored at a single source line.

        Args:
            source: Name of the referencing symbol.
            target: Name of the referenced symbol (normalized URL/path).
            rel_type: Kind of relationship.
            file_path: File in which the reference appears.
            line: Line of the reference.
            confidence: Confidence score attached to the relationship.
        """
        return Relationship(
            source_symbol=source,
            target_symbol=target,
            relationship_type=rel_type,
            source_file=file_path,
            source_range=self._make_range(line),
            confidence=confidence
        )

    def _get_line_number(self, content: str, match_start: int) -> int:
        """Return the 1-based line number of character offset ``match_start``."""
        # Counting within bounds avoids copying the prefix for every match.
        return content.count('\n', 0, match_start) + 1

    def parse_file(self, file_path: str, content: Optional[str] = None) -> ParseResult:
        """
        Parse an HTML file for links and references.

        Args:
            file_path: Path to the file
            content: Optional file content (read from file if not provided)

        Returns:
            ParseResult with symbols (title, ids) and relationships (links,
            imports). If the file cannot be read, the result has no symbols or
            relationships and a single entry in ``errors`` naming the cause.
        """
        if content is None:
            try:
                with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
                    content = f.read()
            except Exception as exc:
                # Best-effort: report the failure instead of raising, but keep
                # the reason so the caller can tell why the file was skipped.
                return ParseResult(
                    symbols=[],
                    relationships=[],
                    errors=[f"Could not read {file_path}: {exc}"]
                )

        symbols: List[Symbol] = []
        relationships: List[Relationship] = []
        errors: List[str] = []

        # Document name for source references
        doc_name = Path(file_path).stem

        # Extract title if present
        self._extract_title(content, file_path, symbols)

        # Extract ID attributes as potential anchor targets
        self._extract_ids(content, file_path, symbols)

        # Extract all link types
        self._extract_anchors(content, file_path, doc_name, relationships)
        self._extract_scripts(content, file_path, doc_name, relationships)
        self._extract_stylesheets(content, file_path, doc_name, relationships)
        self._extract_images(content, file_path, doc_name, relationships)
        self._extract_forms(content, file_path, doc_name, relationships)
        self._extract_media(content, file_path, doc_name, relationships)
        self._extract_embeds(content, file_path, doc_name, relationships)
        self._extract_background_images(content, file_path, doc_name, relationships)
        self._extract_data_urls(content, file_path, doc_name, relationships)
        # Appended last so the order of the previously emitted relationships
        # is unchanged.
        self._extract_meta_urls(content, file_path, doc_name, relationships)

        return ParseResult(
            symbols=symbols,
            relationships=relationships,
            errors=errors
        )

    def _extract_title(self, content: str, file_path: str, symbols: List[Symbol]) -> None:
        """Append a ``document_title`` symbol for the first <title> element, if any."""
        match = self.PATTERNS['title'].search(content)
        if not match:
            return
        title = match.group(1).strip()
        if not title:
            # A whitespace-only title carries no identifying information.
            return
        symbols.append(self._make_symbol(
            name=title,
            symbol_type="document_title",
            line=self._get_line_number(content, match.start()),
            file_path=file_path
        ))

    def _extract_ids(self, content: str, file_path: str, symbols: List[Symbol]) -> None:
        """Append an ``anchor_target`` symbol (``#id``) for each element with an id."""
        for match in self.PATTERNS['id_attr'].finditer(content):
            tag = match.group(1)
            id_value = match.group(2)
            symbols.append(self._make_symbol(
                name=f"#{id_value}",
                symbol_type="anchor_target",
                line=self._get_line_number(content, match.start()),
                file_path=file_path,
                metadata={'tag': tag}
            ))

    def _collect_relationships(
        self,
        pattern_key: str,
        content: str,
        file_path: str,
        doc_name: str,
        relationships: List[Relationship],
        rel_type: RelationshipType,
        confidence: float,
        skip_prefixes: tuple = ()
    ) -> None:
        """
        Append one relationship per usable match of ``PATTERNS[pattern_key]``.

        Args:
            pattern_key: Key into ``PATTERNS``; group 1 must capture the URL.
            content: Document text to scan.
            file_path: File the content came from.
            doc_name: Source symbol name for every relationship.
            relationships: List that receives the new relationships.
            rel_type: Relationship type to emit.
            confidence: Confidence score to emit.
            skip_prefixes: Lower-case URL prefixes to ignore; compared
                case-insensitively after trimming whitespace.
        """
        for match in self.PATTERNS[pattern_key].finditer(content):
            raw = match.group(1).strip()
            # str.startswith(()) is False, so an empty tuple skips nothing.
            if not raw or raw.lower().startswith(skip_prefixes):
                continue

            target = self._normalize_url(raw)
            if not target:
                # e.g. "?page=2" or "./" normalizes to nothing: no file to point at.
                continue

            relationships.append(self._make_relationship(
                source=doc_name,
                target=target,
                rel_type=rel_type,
                file_path=file_path,
                line=self._get_line_number(content, match.start()),
                confidence=confidence
            ))

    def _extract_anchors(self, content: str, file_path: str, doc_name: str, relationships: List[Relationship]) -> None:
        """Extract anchor links, ignoring in-page, javascript:, mailto: and tel: hrefs."""
        self._collect_relationships(
            'anchor', content, file_path, doc_name, relationships,
            rel_type=RelationshipType.REFERENCES,
            confidence=0.90,
            skip_prefixes=('javascript:', '#', 'mailto:', 'tel:')
        )

    def _extract_scripts(self, content: str, file_path: str, doc_name: str, relationships: List[Relationship]) -> None:
        """Extract script imports."""
        self._collect_relationships(
            'script', content, file_path, doc_name, relationships,
            rel_type=RelationshipType.IMPORTS,
            confidence=0.95
        )

    def _extract_stylesheets(self, content: str, file_path: str, doc_name: str, relationships: List[Relationship]) -> None:
        """Extract stylesheet links."""
        self._collect_relationships(
            'stylesheet', content, file_path, doc_name, relationships,
            rel_type=RelationshipType.IMPORTS,
            confidence=0.95
        )

    def _extract_images(self, content: str, file_path: str, doc_name: str, relationships: List[Relationship]) -> None:
        """Extract image sources, ignoring inline data: URIs."""
        self._collect_relationships(
            'image', content, file_path, doc_name, relationships,
            rel_type=RelationshipType.REFERENCES,
            confidence=0.85,
            skip_prefixes=('data:',)
        )

    def _extract_forms(self, content: str, file_path: str, doc_name: str, relationships: List[Relationship]) -> None:
        """Extract form actions, ignoring in-page (``#``) actions."""
        self._collect_relationships(
            'form_action', content, file_path, doc_name, relationships,
            rel_type=RelationshipType.REFERENCES,
            confidence=0.80,
            skip_prefixes=('#',)
        )

    def _extract_media(self, content: str, file_path: str, doc_name: str, relationships: List[Relationship]) -> None:
        """Extract video/audio/source sources, then iframe sources."""
        self._collect_relationships(
            'media', content, file_path, doc_name, relationships,
            rel_type=RelationshipType.REFERENCES,
            confidence=0.85
        )

        # Also check iframe
        self._collect_relationships(
            'iframe', content, file_path, doc_name, relationships,
            rel_type=RelationshipType.REFERENCES,
            confidence=0.80
        )

    def _extract_embeds(self, content: str, file_path: str, doc_name: str, relationships: List[Relationship]) -> None:
        """Extract object/embed sources."""
        self._collect_relationships(
            'embed', content, file_path, doc_name, relationships,
            rel_type=RelationshipType.REFERENCES,
            confidence=0.80
        )

    def _extract_background_images(self, content: str, file_path: str, doc_name: str, relationships: List[Relationship]) -> None:
        """Extract background images from inline styles, ignoring data: URIs."""
        self._collect_relationships(
            'bg_image', content, file_path, doc_name, relationships,
            rel_type=RelationshipType.REFERENCES,
            confidence=0.75,
            skip_prefixes=('data:',)
        )

    def _extract_data_urls(self, content: str, file_path: str, doc_name: str, relationships: List[Relationship]) -> None:
        """Extract URLs from data-src / data-href / data-url attributes."""
        self._collect_relationships(
            'data_url', content, file_path, doc_name, relationships,
            rel_type=RelationshipType.REFERENCES,
            confidence=0.70,
            skip_prefixes=('data:',)
        )

    def _extract_meta_urls(self, content: str, file_path: str, doc_name: str, relationships: List[Relationship]) -> None:
        """Extract ``url=`` targets from <meta> tags (e.g. meta refresh)."""
        self._collect_relationships(
            'meta_url', content, file_path, doc_name, relationships,
            rel_type=RelationshipType.REFERENCES,
            confidence=0.80
        )

    def _normalize_url(self, url: str) -> str:
        """
        Normalize URL for consistent reference.

        Absolute (``http://``, ``https://``) and protocol-relative (``//``)
        URLs are returned trimmed but otherwise untouched. Anything else is
        treated as a local path: a leading ``./`` is removed and the query
        string and fragment are dropped, so every reference to the same file
        yields the same target name.

        Args:
            url: Raw attribute value.

        Returns:
            The normalized URL or path; may be empty when the value consisted
            only of a query string, a fragment, or ``./``.
        """
        url = url.strip()

        # Keep full URLs (scheme comparison is case-insensitive)
        if url.lower().startswith(('http://', 'https://', '//')):
            return url

        # Clean relative paths
        if url.startswith('./'):
            url = url[2:]

        # Remove query strings and fragments for local files
        for separator in ('?', '#'):
            url = url.split(separator, 1)[0]

        return url