alita-sdk 0.3.257__py3-none-any.whl → 0.3.584__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of alita-sdk might be problematic. Click here for more details.
- alita_sdk/cli/__init__.py +10 -0
- alita_sdk/cli/__main__.py +17 -0
- alita_sdk/cli/agent/__init__.py +5 -0
- alita_sdk/cli/agent/default.py +258 -0
- alita_sdk/cli/agent_executor.py +155 -0
- alita_sdk/cli/agent_loader.py +215 -0
- alita_sdk/cli/agent_ui.py +228 -0
- alita_sdk/cli/agents.py +3794 -0
- alita_sdk/cli/callbacks.py +647 -0
- alita_sdk/cli/cli.py +168 -0
- alita_sdk/cli/config.py +306 -0
- alita_sdk/cli/context/__init__.py +30 -0
- alita_sdk/cli/context/cleanup.py +198 -0
- alita_sdk/cli/context/manager.py +731 -0
- alita_sdk/cli/context/message.py +285 -0
- alita_sdk/cli/context/strategies.py +289 -0
- alita_sdk/cli/context/token_estimation.py +127 -0
- alita_sdk/cli/formatting.py +182 -0
- alita_sdk/cli/input_handler.py +419 -0
- alita_sdk/cli/inventory.py +1073 -0
- alita_sdk/cli/mcp_loader.py +315 -0
- alita_sdk/cli/toolkit.py +327 -0
- alita_sdk/cli/toolkit_loader.py +85 -0
- alita_sdk/cli/tools/__init__.py +43 -0
- alita_sdk/cli/tools/approval.py +224 -0
- alita_sdk/cli/tools/filesystem.py +1751 -0
- alita_sdk/cli/tools/planning.py +389 -0
- alita_sdk/cli/tools/terminal.py +414 -0
- alita_sdk/community/__init__.py +72 -12
- alita_sdk/community/inventory/__init__.py +236 -0
- alita_sdk/community/inventory/config.py +257 -0
- alita_sdk/community/inventory/enrichment.py +2137 -0
- alita_sdk/community/inventory/extractors.py +1469 -0
- alita_sdk/community/inventory/ingestion.py +3172 -0
- alita_sdk/community/inventory/knowledge_graph.py +1457 -0
- alita_sdk/community/inventory/parsers/__init__.py +218 -0
- alita_sdk/community/inventory/parsers/base.py +295 -0
- alita_sdk/community/inventory/parsers/csharp_parser.py +907 -0
- alita_sdk/community/inventory/parsers/go_parser.py +851 -0
- alita_sdk/community/inventory/parsers/html_parser.py +389 -0
- alita_sdk/community/inventory/parsers/java_parser.py +593 -0
- alita_sdk/community/inventory/parsers/javascript_parser.py +629 -0
- alita_sdk/community/inventory/parsers/kotlin_parser.py +768 -0
- alita_sdk/community/inventory/parsers/markdown_parser.py +362 -0
- alita_sdk/community/inventory/parsers/python_parser.py +604 -0
- alita_sdk/community/inventory/parsers/rust_parser.py +858 -0
- alita_sdk/community/inventory/parsers/swift_parser.py +832 -0
- alita_sdk/community/inventory/parsers/text_parser.py +322 -0
- alita_sdk/community/inventory/parsers/yaml_parser.py +370 -0
- alita_sdk/community/inventory/patterns/__init__.py +61 -0
- alita_sdk/community/inventory/patterns/ast_adapter.py +380 -0
- alita_sdk/community/inventory/patterns/loader.py +348 -0
- alita_sdk/community/inventory/patterns/registry.py +198 -0
- alita_sdk/community/inventory/presets.py +535 -0
- alita_sdk/community/inventory/retrieval.py +1403 -0
- alita_sdk/community/inventory/toolkit.py +173 -0
- alita_sdk/community/inventory/toolkit_utils.py +176 -0
- alita_sdk/community/inventory/visualize.py +1370 -0
- alita_sdk/configurations/__init__.py +11 -0
- alita_sdk/configurations/ado.py +148 -2
- alita_sdk/configurations/azure_search.py +1 -1
- alita_sdk/configurations/bigquery.py +1 -1
- alita_sdk/configurations/bitbucket.py +94 -2
- alita_sdk/configurations/browser.py +18 -0
- alita_sdk/configurations/carrier.py +19 -0
- alita_sdk/configurations/confluence.py +130 -1
- alita_sdk/configurations/delta_lake.py +1 -1
- alita_sdk/configurations/figma.py +76 -5
- alita_sdk/configurations/github.py +65 -1
- alita_sdk/configurations/gitlab.py +81 -0
- alita_sdk/configurations/google_places.py +17 -0
- alita_sdk/configurations/jira.py +103 -0
- alita_sdk/configurations/openapi.py +323 -0
- alita_sdk/configurations/postman.py +1 -1
- alita_sdk/configurations/qtest.py +72 -3
- alita_sdk/configurations/report_portal.py +115 -0
- alita_sdk/configurations/salesforce.py +19 -0
- alita_sdk/configurations/service_now.py +1 -12
- alita_sdk/configurations/sharepoint.py +167 -0
- alita_sdk/configurations/sonar.py +18 -0
- alita_sdk/configurations/sql.py +20 -0
- alita_sdk/configurations/testio.py +101 -0
- alita_sdk/configurations/testrail.py +88 -0
- alita_sdk/configurations/xray.py +94 -1
- alita_sdk/configurations/zephyr_enterprise.py +94 -1
- alita_sdk/configurations/zephyr_essential.py +95 -0
- alita_sdk/runtime/clients/artifact.py +21 -4
- alita_sdk/runtime/clients/client.py +458 -67
- alita_sdk/runtime/clients/mcp_discovery.py +342 -0
- alita_sdk/runtime/clients/mcp_manager.py +262 -0
- alita_sdk/runtime/clients/sandbox_client.py +352 -0
- alita_sdk/runtime/langchain/_constants_bkup.py +1318 -0
- alita_sdk/runtime/langchain/assistant.py +183 -43
- alita_sdk/runtime/langchain/constants.py +647 -1
- alita_sdk/runtime/langchain/document_loaders/AlitaDocxMammothLoader.py +315 -3
- alita_sdk/runtime/langchain/document_loaders/AlitaExcelLoader.py +209 -31
- alita_sdk/runtime/langchain/document_loaders/AlitaImageLoader.py +1 -1
- alita_sdk/runtime/langchain/document_loaders/AlitaJSONLinesLoader.py +77 -0
- alita_sdk/runtime/langchain/document_loaders/AlitaJSONLoader.py +10 -3
- alita_sdk/runtime/langchain/document_loaders/AlitaMarkdownLoader.py +66 -0
- alita_sdk/runtime/langchain/document_loaders/AlitaPDFLoader.py +79 -10
- alita_sdk/runtime/langchain/document_loaders/AlitaPowerPointLoader.py +52 -15
- alita_sdk/runtime/langchain/document_loaders/AlitaPythonLoader.py +9 -0
- alita_sdk/runtime/langchain/document_loaders/AlitaTableLoader.py +1 -4
- alita_sdk/runtime/langchain/document_loaders/AlitaTextLoader.py +15 -2
- alita_sdk/runtime/langchain/document_loaders/ImageParser.py +30 -0
- alita_sdk/runtime/langchain/document_loaders/constants.py +189 -41
- alita_sdk/runtime/langchain/interfaces/llm_processor.py +4 -2
- alita_sdk/runtime/langchain/langraph_agent.py +493 -105
- alita_sdk/runtime/langchain/utils.py +118 -8
- alita_sdk/runtime/llms/preloaded.py +2 -6
- alita_sdk/runtime/models/mcp_models.py +61 -0
- alita_sdk/runtime/skills/__init__.py +91 -0
- alita_sdk/runtime/skills/callbacks.py +498 -0
- alita_sdk/runtime/skills/discovery.py +540 -0
- alita_sdk/runtime/skills/executor.py +610 -0
- alita_sdk/runtime/skills/input_builder.py +371 -0
- alita_sdk/runtime/skills/models.py +330 -0
- alita_sdk/runtime/skills/registry.py +355 -0
- alita_sdk/runtime/skills/skill_runner.py +330 -0
- alita_sdk/runtime/toolkits/__init__.py +28 -0
- alita_sdk/runtime/toolkits/application.py +14 -4
- alita_sdk/runtime/toolkits/artifact.py +25 -9
- alita_sdk/runtime/toolkits/datasource.py +13 -6
- alita_sdk/runtime/toolkits/mcp.py +782 -0
- alita_sdk/runtime/toolkits/planning.py +178 -0
- alita_sdk/runtime/toolkits/skill_router.py +238 -0
- alita_sdk/runtime/toolkits/subgraph.py +11 -6
- alita_sdk/runtime/toolkits/tools.py +314 -70
- alita_sdk/runtime/toolkits/vectorstore.py +11 -5
- alita_sdk/runtime/tools/__init__.py +24 -0
- alita_sdk/runtime/tools/application.py +16 -4
- alita_sdk/runtime/tools/artifact.py +367 -33
- alita_sdk/runtime/tools/data_analysis.py +183 -0
- alita_sdk/runtime/tools/function.py +100 -4
- alita_sdk/runtime/tools/graph.py +81 -0
- alita_sdk/runtime/tools/image_generation.py +218 -0
- alita_sdk/runtime/tools/llm.py +1032 -177
- alita_sdk/runtime/tools/loop.py +3 -1
- alita_sdk/runtime/tools/loop_output.py +3 -1
- alita_sdk/runtime/tools/mcp_inspect_tool.py +284 -0
- alita_sdk/runtime/tools/mcp_remote_tool.py +181 -0
- alita_sdk/runtime/tools/mcp_server_tool.py +3 -1
- alita_sdk/runtime/tools/planning/__init__.py +36 -0
- alita_sdk/runtime/tools/planning/models.py +246 -0
- alita_sdk/runtime/tools/planning/wrapper.py +607 -0
- alita_sdk/runtime/tools/router.py +2 -1
- alita_sdk/runtime/tools/sandbox.py +375 -0
- alita_sdk/runtime/tools/skill_router.py +776 -0
- alita_sdk/runtime/tools/tool.py +3 -1
- alita_sdk/runtime/tools/vectorstore.py +69 -65
- alita_sdk/runtime/tools/vectorstore_base.py +163 -90
- alita_sdk/runtime/utils/AlitaCallback.py +137 -21
- alita_sdk/runtime/utils/constants.py +5 -1
- alita_sdk/runtime/utils/mcp_client.py +492 -0
- alita_sdk/runtime/utils/mcp_oauth.py +361 -0
- alita_sdk/runtime/utils/mcp_sse_client.py +434 -0
- alita_sdk/runtime/utils/mcp_tools_discovery.py +124 -0
- alita_sdk/runtime/utils/streamlit.py +41 -14
- alita_sdk/runtime/utils/toolkit_utils.py +28 -9
- alita_sdk/runtime/utils/utils.py +48 -0
- alita_sdk/tools/__init__.py +135 -37
- alita_sdk/tools/ado/__init__.py +2 -2
- alita_sdk/tools/ado/repos/__init__.py +16 -19
- alita_sdk/tools/ado/repos/repos_wrapper.py +12 -20
- alita_sdk/tools/ado/test_plan/__init__.py +27 -8
- alita_sdk/tools/ado/test_plan/test_plan_wrapper.py +56 -28
- alita_sdk/tools/ado/wiki/__init__.py +28 -12
- alita_sdk/tools/ado/wiki/ado_wrapper.py +114 -40
- alita_sdk/tools/ado/work_item/__init__.py +28 -12
- alita_sdk/tools/ado/work_item/ado_wrapper.py +95 -11
- alita_sdk/tools/advanced_jira_mining/__init__.py +13 -8
- alita_sdk/tools/aws/delta_lake/__init__.py +15 -11
- alita_sdk/tools/aws/delta_lake/tool.py +5 -1
- alita_sdk/tools/azure_ai/search/__init__.py +14 -8
- alita_sdk/tools/base/tool.py +5 -1
- alita_sdk/tools/base_indexer_toolkit.py +454 -110
- alita_sdk/tools/bitbucket/__init__.py +28 -19
- alita_sdk/tools/bitbucket/api_wrapper.py +285 -27
- alita_sdk/tools/bitbucket/cloud_api_wrapper.py +5 -5
- alita_sdk/tools/browser/__init__.py +41 -16
- alita_sdk/tools/browser/crawler.py +3 -1
- alita_sdk/tools/browser/utils.py +15 -6
- alita_sdk/tools/carrier/__init__.py +18 -17
- alita_sdk/tools/carrier/backend_reports_tool.py +8 -4
- alita_sdk/tools/carrier/excel_reporter.py +8 -4
- alita_sdk/tools/chunkers/__init__.py +3 -1
- alita_sdk/tools/chunkers/code/codeparser.py +1 -1
- alita_sdk/tools/chunkers/sematic/json_chunker.py +2 -1
- alita_sdk/tools/chunkers/sematic/markdown_chunker.py +97 -6
- alita_sdk/tools/chunkers/sematic/proposal_chunker.py +1 -1
- alita_sdk/tools/chunkers/universal_chunker.py +270 -0
- alita_sdk/tools/cloud/aws/__init__.py +12 -7
- alita_sdk/tools/cloud/azure/__init__.py +12 -7
- alita_sdk/tools/cloud/gcp/__init__.py +12 -7
- alita_sdk/tools/cloud/k8s/__init__.py +12 -7
- alita_sdk/tools/code/linter/__init__.py +10 -8
- alita_sdk/tools/code/loaders/codesearcher.py +3 -2
- alita_sdk/tools/code/sonar/__init__.py +21 -13
- alita_sdk/tools/code_indexer_toolkit.py +199 -0
- alita_sdk/tools/confluence/__init__.py +22 -14
- alita_sdk/tools/confluence/api_wrapper.py +197 -58
- alita_sdk/tools/confluence/loader.py +14 -2
- alita_sdk/tools/custom_open_api/__init__.py +12 -5
- alita_sdk/tools/elastic/__init__.py +11 -8
- alita_sdk/tools/elitea_base.py +546 -64
- alita_sdk/tools/figma/__init__.py +60 -11
- alita_sdk/tools/figma/api_wrapper.py +1400 -167
- alita_sdk/tools/figma/figma_client.py +73 -0
- alita_sdk/tools/figma/toon_tools.py +2748 -0
- alita_sdk/tools/github/__init__.py +18 -17
- alita_sdk/tools/github/api_wrapper.py +9 -26
- alita_sdk/tools/github/github_client.py +81 -12
- alita_sdk/tools/github/schemas.py +2 -1
- alita_sdk/tools/github/tool.py +5 -1
- alita_sdk/tools/gitlab/__init__.py +19 -13
- alita_sdk/tools/gitlab/api_wrapper.py +256 -80
- alita_sdk/tools/gitlab_org/__init__.py +14 -10
- alita_sdk/tools/google/bigquery/__init__.py +14 -13
- alita_sdk/tools/google/bigquery/tool.py +5 -1
- alita_sdk/tools/google_places/__init__.py +21 -11
- alita_sdk/tools/jira/__init__.py +22 -11
- alita_sdk/tools/jira/api_wrapper.py +315 -168
- alita_sdk/tools/keycloak/__init__.py +11 -8
- alita_sdk/tools/localgit/__init__.py +9 -3
- alita_sdk/tools/localgit/local_git.py +62 -54
- alita_sdk/tools/localgit/tool.py +5 -1
- alita_sdk/tools/memory/__init__.py +38 -14
- alita_sdk/tools/non_code_indexer_toolkit.py +7 -2
- alita_sdk/tools/ocr/__init__.py +11 -8
- alita_sdk/tools/openapi/__init__.py +491 -106
- alita_sdk/tools/openapi/api_wrapper.py +1357 -0
- alita_sdk/tools/openapi/tool.py +20 -0
- alita_sdk/tools/pandas/__init__.py +20 -12
- alita_sdk/tools/pandas/api_wrapper.py +40 -45
- alita_sdk/tools/pandas/dataframe/generator/base.py +3 -1
- alita_sdk/tools/postman/__init__.py +11 -11
- alita_sdk/tools/postman/api_wrapper.py +19 -8
- alita_sdk/tools/postman/postman_analysis.py +8 -1
- alita_sdk/tools/pptx/__init__.py +11 -10
- alita_sdk/tools/qtest/__init__.py +22 -14
- alita_sdk/tools/qtest/api_wrapper.py +1784 -88
- alita_sdk/tools/rally/__init__.py +13 -10
- alita_sdk/tools/report_portal/__init__.py +23 -16
- alita_sdk/tools/salesforce/__init__.py +22 -16
- alita_sdk/tools/servicenow/__init__.py +21 -16
- alita_sdk/tools/servicenow/api_wrapper.py +1 -1
- alita_sdk/tools/sharepoint/__init__.py +17 -14
- alita_sdk/tools/sharepoint/api_wrapper.py +179 -39
- alita_sdk/tools/sharepoint/authorization_helper.py +191 -1
- alita_sdk/tools/sharepoint/utils.py +8 -2
- alita_sdk/tools/slack/__init__.py +13 -8
- alita_sdk/tools/sql/__init__.py +22 -19
- alita_sdk/tools/sql/api_wrapper.py +71 -23
- alita_sdk/tools/testio/__init__.py +21 -13
- alita_sdk/tools/testrail/__init__.py +13 -11
- alita_sdk/tools/testrail/api_wrapper.py +214 -46
- alita_sdk/tools/utils/__init__.py +28 -4
- alita_sdk/tools/utils/content_parser.py +241 -55
- alita_sdk/tools/utils/text_operations.py +254 -0
- alita_sdk/tools/vector_adapters/VectorStoreAdapter.py +83 -27
- alita_sdk/tools/xray/__init__.py +18 -14
- alita_sdk/tools/xray/api_wrapper.py +58 -113
- alita_sdk/tools/yagmail/__init__.py +9 -3
- alita_sdk/tools/zephyr/__init__.py +12 -7
- alita_sdk/tools/zephyr_enterprise/__init__.py +16 -9
- alita_sdk/tools/zephyr_enterprise/api_wrapper.py +30 -15
- alita_sdk/tools/zephyr_essential/__init__.py +16 -10
- alita_sdk/tools/zephyr_essential/api_wrapper.py +297 -54
- alita_sdk/tools/zephyr_essential/client.py +6 -4
- alita_sdk/tools/zephyr_scale/__init__.py +13 -8
- alita_sdk/tools/zephyr_scale/api_wrapper.py +39 -31
- alita_sdk/tools/zephyr_squad/__init__.py +12 -7
- {alita_sdk-0.3.257.dist-info → alita_sdk-0.3.584.dist-info}/METADATA +184 -37
- alita_sdk-0.3.584.dist-info/RECORD +452 -0
- alita_sdk-0.3.584.dist-info/entry_points.txt +2 -0
- alita_sdk/tools/bitbucket/tools.py +0 -304
- alita_sdk-0.3.257.dist-info/RECORD +0 -343
- {alita_sdk-0.3.257.dist-info → alita_sdk-0.3.584.dist-info}/WHEEL +0 -0
- {alita_sdk-0.3.257.dist-info → alita_sdk-0.3.584.dist-info}/licenses/LICENSE +0 -0
- {alita_sdk-0.3.257.dist-info → alita_sdk-0.3.584.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,77 @@
|
|
|
1
|
+
from .AlitaJSONLoader import AlitaJSONLoader
|
|
2
|
+
import json
|
|
3
|
+
from io import StringIO
|
|
4
|
+
from typing import List, Iterator
|
|
5
|
+
|
|
6
|
+
from langchain_core.documents import Document
|
|
7
|
+
from langchain_core.tools import ToolException
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
class AlitaJSONLinesLoader(AlitaJSONLoader):
    """Load JSON Lines (JSONL) content: one JSON document per non-empty line.

    Behavior:
    - Reads from `file_path` or `file_content` (bytes, str, or a file-like object).
    - Blank lines are skipped.
    - Every remaining line is handed to its own AlitaJSONLoader, so each line is
      chunked independently by that loader's lazy_load().
    - Documents of all lines are produced in source order.
    """

    def __init__(self, **kwargs):
        # Reuse AlitaJSONLoader initialization logic (file_path / file_content handling, encoding, etc.)
        super().__init__(**kwargs)

    def _iter_lines(self) -> Iterator[str]:
        """Yield raw text lines from file_path or, failing that, from file_content.

        Raises:
            ToolException: if neither source is set (or both are empty).
        """
        path = getattr(self, "file_path", None)
        if path:
            with open(path, "r", encoding=self.encoding) as f:
                yield from f
            return

        content = getattr(self, "file_content", None)
        if not content:
            raise ToolException("'file_path' or 'file_content' parameter should be provided.")

        if isinstance(content, (bytes, bytearray)):
            yield from StringIO(content.decode(self.encoding))
        elif isinstance(content, str):
            # A plain string has no seek(); iterate it line by line directly.
            yield from StringIO(content)
        else:
            # File-like object: rewind, then normalise binary lines to text.
            content.seek(0)
            for line in content:
                if isinstance(line, (bytes, bytearray)):
                    line = line.decode(self.encoding)
                yield line

    def lazy_load(self) -> Iterator[Document]:
        """Yield Documents line by line.

        Overridden so that lazy consumers get the same per-line handling as load().

        Raises:
            ToolException: if a line cannot be processed; the message carries the
                first 100 characters of the offending line.
        """
        source_name = getattr(self, "file_name", str(getattr(self, "file_path", "no_name")))
        for raw_line in self._iter_lines():
            line = raw_line.strip()
            if not line:
                continue
            try:
                # Per-line loader configured like this one.
                line_loader = AlitaJSONLoader(
                    file_content=line,
                    file_name=source_name,
                    encoding=self.encoding,
                    autodetect_encoding=self.autodetect_encoding,
                    max_tokens=self.max_tokens,
                )
                # Materialise inside the try so only processing errors get wrapped.
                line_docs = list(line_loader.lazy_load())
            except Exception as e:
                raise ToolException(f"Error processing JSONL line: {line[:100]}... Error: {e}") from e
            yield from line_docs

    def load(self) -> List[Document]:  # type: ignore[override]
        """Return the Documents of every non-empty line as a single list."""
        return list(self.lazy_load())
|
|
@@ -30,7 +30,12 @@ class AlitaJSONLoader(BaseLoader):
|
|
|
30
30
|
with open(self.file_path, encoding=self.encoding) as f:
|
|
31
31
|
return json.load(f)
|
|
32
32
|
elif hasattr(self, 'file_content') and self.file_content:
|
|
33
|
-
|
|
33
|
+
if isinstance(self.file_content, bytes):
|
|
34
|
+
return json.loads(self.file_content.decode(self.encoding))
|
|
35
|
+
elif isinstance(self.file_content, str):
|
|
36
|
+
return json.loads(self.file_content)
|
|
37
|
+
else:
|
|
38
|
+
return json.load(self.file_content)
|
|
34
39
|
else:
|
|
35
40
|
raise ValueError("Neither file_path nor file_content is provided.")
|
|
36
41
|
|
|
@@ -42,7 +47,6 @@ class AlitaJSONLoader(BaseLoader):
|
|
|
42
47
|
try:
|
|
43
48
|
with open(self.file_path, encoding=encoding.encoding) as f:
|
|
44
49
|
return f.read()
|
|
45
|
-
break
|
|
46
50
|
except UnicodeDecodeError:
|
|
47
51
|
continue
|
|
48
52
|
elif hasattr(self, 'file_content') and self.file_content:
|
|
@@ -68,6 +72,9 @@ class AlitaJSONLoader(BaseLoader):
|
|
|
68
72
|
else:
|
|
69
73
|
data_dict = content_json
|
|
70
74
|
chunks = RecursiveJsonSplitter(max_chunk_size=self.max_tokens).split_json(json_data=data_dict)
|
|
75
|
+
chunk_id = 1
|
|
71
76
|
for chunk in chunks:
|
|
72
|
-
metadata = {"source": str(self.file_path) if hasattr(self, 'file_path') else self.file_name
|
|
77
|
+
metadata = {"source": str(self.file_path) if hasattr(self, 'file_path') else self.file_name,
|
|
78
|
+
"chunk_id": chunk_id}
|
|
79
|
+
chunk_id+=1
|
|
73
80
|
yield Document(page_content=json.dumps(chunk), metadata=metadata)
|
|
@@ -0,0 +1,66 @@
|
|
|
1
|
+
from pathlib import Path
|
|
2
|
+
from typing import Any, List, Union, Generator, Iterator
|
|
3
|
+
from langchain_core.documents import Document
|
|
4
|
+
|
|
5
|
+
from langchain_community.document_loaders.unstructured import (
|
|
6
|
+
UnstructuredFileLoader,
|
|
7
|
+
validate_unstructured_version,
|
|
8
|
+
)
|
|
9
|
+
|
|
10
|
+
class AlitaMarkdownLoader(UnstructuredFileLoader):
    """Markdown file loader that splits content with the SDK's markdown_chunker."""

    def __init__(
        self,
        file_path: Union[str, Path],
        mode: str = "elements",
        chunker_config: dict = None,
        **unstructured_kwargs: Any,
    ):
        """
        Args:
            file_path: Path of the Markdown file; converted to ``str`` before use.
            mode: Forwarded unchanged to UnstructuredFileLoader. Default is "elements".
            chunker_config: Settings passed to markdown_chunker. When omitted (or
                empty) the built-in defaults below are used.
            **unstructured_kwargs: Extra keyword arguments forwarded to the base loader.
        """
        path_str = str(file_path)
        validate_unstructured_version("0.4.16")
        fallback_config = {
            "strip_header": False,
            "return_each_line": False,
            "headers_to_split_on": [],
            "max_tokens": 512,
            "token_overlap": 10,
        }
        self.chunker_config = chunker_config if chunker_config else fallback_config
        super().__init__(file_path=path_str, mode=mode, **unstructured_kwargs)

    def _file_content_generator(self) -> Generator[Document, None, None]:
        """Yield one Document holding the whole file, read as UTF-8."""
        whole_text = Path(self.file_path).read_text(encoding="utf-8")
        yield Document(page_content=whole_text, metadata={"source": self.file_path})

    def _get_elements(self) -> List[Document]:
        """Run markdown_chunker over the file and return its output as a list."""
        # Imported here, as in the original, rather than at module import time.
        from alita_sdk.tools.chunkers.sematic.markdown_chunker import markdown_chunker

        return list(markdown_chunker(self._file_content_generator(), config=self.chunker_config))

    def lazy_load(self) -> Iterator[Document]:
        """Chunk the file, apply the base loader's post-processing, then yield the chunks."""
        chunked_docs = self._get_elements()
        self._post_process_elements(chunked_docs)
        yield from chunked_docs
|
|
@@ -1,5 +1,8 @@
|
|
|
1
1
|
import pymupdf
|
|
2
|
-
|
|
2
|
+
import fitz
|
|
3
|
+
from langchain_community.document_loaders import PyPDFium2Loader
|
|
4
|
+
|
|
5
|
+
from .ImageParser import ImageParser
|
|
3
6
|
from .utils import perform_llm_prediction_for_image_bytes, create_temp_file
|
|
4
7
|
from langchain_core.tools import ToolException
|
|
5
8
|
|
|
@@ -20,6 +23,7 @@ class AlitaPDFLoader:
|
|
|
20
23
|
self.headers = kwargs.get('headers', None)
|
|
21
24
|
self.extraction_mode = kwargs.get('extraction_mode', "plain")
|
|
22
25
|
self.extraction_kwargs = kwargs.get('extraction_kwargs', None)
|
|
26
|
+
self.images_parser=ImageParser(llm=self.llm, prompt=self.prompt)
|
|
23
27
|
|
|
24
28
|
def get_content(self):
|
|
25
29
|
if hasattr(self, 'file_path'):
|
|
@@ -41,8 +45,59 @@ class AlitaPDFLoader:
|
|
|
41
45
|
return text_content
|
|
42
46
|
|
|
43
47
|
def read_pdf_page(self, report, page, index):
|
|
44
|
-
|
|
45
|
-
|
|
48
|
+
# Extract text in block format (to more accurately match hyperlinks to text)
|
|
49
|
+
text_blocks = page.get_text("blocks") # Returns a list of text blocks
|
|
50
|
+
words = page.get_text("words") # Returns words with their coordinates
|
|
51
|
+
|
|
52
|
+
# Extract hyperlinks
|
|
53
|
+
links = page.get_links()
|
|
54
|
+
|
|
55
|
+
# Create a list to store the modified text
|
|
56
|
+
modified_text = []
|
|
57
|
+
|
|
58
|
+
for block in text_blocks:
|
|
59
|
+
block_rect = fitz.Rect(block[:4]) # Coordinates of the text block
|
|
60
|
+
block_text = block[4] # The actual text of the block
|
|
61
|
+
|
|
62
|
+
# Check if there are hyperlinks intersecting with this text block
|
|
63
|
+
for link in links:
|
|
64
|
+
if "uri" in link: # Ensure this is a hyperlink
|
|
65
|
+
link_rect = link["from"] # Coordinates of the hyperlink area
|
|
66
|
+
link_uri = link["uri"] # The URL of the hyperlink
|
|
67
|
+
|
|
68
|
+
# Expand the hyperlink area slightly to account for inaccuracies
|
|
69
|
+
link_rect = fitz.Rect(
|
|
70
|
+
link_rect.x0 - 1, link_rect.y0 - 1, link_rect.x1 + 1, link_rect.y1 + 1
|
|
71
|
+
)
|
|
72
|
+
|
|
73
|
+
# Find words that are inside the hyperlink area
|
|
74
|
+
link_text = []
|
|
75
|
+
for word in words:
|
|
76
|
+
word_rect = fitz.Rect(word[:4]) # Coordinates of the word
|
|
77
|
+
word_text = word[4]
|
|
78
|
+
|
|
79
|
+
# Check if the word rectangle is fully inside the hyperlink rectangle
|
|
80
|
+
if link_rect.contains(word_rect):
|
|
81
|
+
link_text.append(word_text)
|
|
82
|
+
# If the word partially intersects, check vertical alignment
|
|
83
|
+
elif link_rect.intersects(word_rect):
|
|
84
|
+
# Condition: The word must be on the same line as the hyperlink
|
|
85
|
+
if abs(link_rect.y0 - word_rect.y0) < 2 and abs(link_rect.y1 - word_rect.y1) < 2:
|
|
86
|
+
link_text.append(word_text)
|
|
87
|
+
|
|
88
|
+
# Format the hyperlink in Markdown
|
|
89
|
+
full_text = " ".join(link_text) if link_text else "No text"
|
|
90
|
+
hyperlink = f"[{full_text}]({link_uri})"
|
|
91
|
+
|
|
92
|
+
# Replace the hyperlink text in the block with the formatted hyperlink
|
|
93
|
+
block_text = block_text.replace(full_text, hyperlink)
|
|
94
|
+
|
|
95
|
+
# Add the processed text block to the result
|
|
96
|
+
modified_text.append(block_text)
|
|
97
|
+
|
|
98
|
+
# Combine all text blocks into the final text for the page
|
|
99
|
+
text_content = f'Page: {index}\n' + "\n".join(modified_text)
|
|
100
|
+
|
|
46
101
|
if self.extract_images:
|
|
47
102
|
images = page.get_images(full=True)
|
|
48
103
|
for i, img in enumerate(images):
|
|
@@ -54,10 +109,24 @@ class AlitaPDFLoader:
|
|
|
54
109
|
|
|
55
110
|
def load(self):
    """Load the PDF and return the result of ``_load_docs()``.

    When the loader has a ``file_path`` it is used directly. Otherwise
    ``file_content`` is written to a temporary ``.pdf`` file that exists only
    for the duration of this call.
    """
    if hasattr(self, 'file_path'):
        return self._load_docs()

    import tempfile

    with tempfile.NamedTemporaryFile(mode='w+b', delete=True, suffix=".pdf") as temp_file:
        temp_file.write(self.file_content)
        temp_file.flush()
        self.file_path = temp_file.name
        try:
            return self._load_docs()
        finally:
            # The temp file is removed when the block exits; drop the attribute
            # so a later load() does not try to read a path that no longer exists.
            del self.file_path
|
|
121
|
+
|
|
122
|
+
def _load_docs(self):
    """Load ``self.file_path`` with PyPDFium2Loader and tag each document.

    Returns:
        The loaded documents, each with ``metadata['chunk_id']`` set to the
        value of its ``metadata['page']``.
    """
    pdf_loader = PyPDFium2Loader(
        file_path=self.file_path,
        password=self.password,
        headers=self.headers,
        extract_images=self.extract_images,
        images_parser=ImageParser(llm=self.llm, prompt=self.prompt),
    )
    documents = pdf_loader.load()
    for document in documents:
        page = document.metadata['page']
        document.metadata['chunk_id'] = page
    return documents
|
|
@@ -1,10 +1,10 @@
|
|
|
1
1
|
import io
|
|
2
2
|
|
|
3
|
-
from langchain_community.document_loaders import UnstructuredPowerPointLoader
|
|
4
3
|
from langchain_core.tools import ToolException
|
|
5
4
|
from pptx import Presentation
|
|
6
5
|
from .utils import perform_llm_prediction_for_image_bytes, create_temp_file
|
|
7
6
|
from pptx.enum.shapes import MSO_SHAPE_TYPE
|
|
7
|
+
from langchain_core.documents import Document
|
|
8
8
|
|
|
9
9
|
class AlitaPowerPointLoader:
|
|
10
10
|
|
|
@@ -22,33 +22,70 @@ class AlitaPowerPointLoader:
|
|
|
22
22
|
self.extract_images = unstructured_kwargs.get('extract_images', False)
|
|
23
23
|
self.llm = unstructured_kwargs.get('llm', None)
|
|
24
24
|
self.prompt = unstructured_kwargs.get('prompt', "Describe image")
|
|
25
|
+
self.pages_per_chunk = unstructured_kwargs.get('pages_per_chunk', 5)
|
|
25
26
|
|
|
26
27
|
def get_content(self):
    """Read the presentation and return its slide text.

    Returns:
        A single newline-joined string when ``self.mode`` is 'single', or a
        list with one string per slide when it is 'paged'. If
        ``self.page_number`` is set, only that slide (1-based) is read.

    Raises:
        ToolException: if neither ``file_path`` nor ``file_content`` is set,
            or if ``self.mode`` is not 'single' or 'paged'.
    """
    if hasattr(self, 'file_path'):
        with open(self.file_path, 'rb') as f:
            prs = Presentation(f)
    elif hasattr(self, 'file_content'):
        prs = Presentation(io.BytesIO(self.file_content))
    else:
        # Without this branch `prs` would be unbound and fail with an opaque error.
        raise ToolException("'file_path' or 'file_content' parameter should be provided.")

    if self.page_number is not None:
        pages = [self.read_pptx_slide(prs.slides[self.page_number - 1], self.page_number)]
    else:
        pages = [self.read_pptx_slide(slide, index) for index, slide in enumerate(prs.slides, start=1)]

    if self.mode == 'single':
        return "\n".join(pages)
    if self.mode == 'paged':
        return pages
    raise ToolException(f"Unknown mode value: {self.mode}. Only 'single', 'paged' values allowed.")
|
|
35
45
|
|
|
36
46
|
def read_pptx_slide(self, slide, index):
    """Render one slide as text.

    Args:
        slide: Slide object exposing ``shapes``.
        index: Slide number written into the ``Slide: N`` header line.

    Returns:
        The header, then the text of every shape that has a text frame (runs
        with a hyperlink address are rendered as ``[text](url)``, with "Link"
        as the text when the run is blank), one line per paragraph. When
        ``self.extract_images`` is set, picture shapes add an image transcript
        section. The result ends with an extra newline.
    """
    parts = [f'Slide: {index}\n']
    for shape in slide.shapes:
        if hasattr(shape, "text_frame") and shape.text_frame is not None:
            for paragraph in shape.text_frame.paragraphs:
                for run in paragraph.runs:
                    if run.hyperlink and run.hyperlink.address:
                        link_text = run.text.strip() or "Link"
                        link_url = run.hyperlink.address
                        parts.append(f" [{link_text}]({link_url}) ")
                    else:
                        parts.append(run.text)
                parts.append("\n")
        elif self.extract_images and shape.shape_type == MSO_SHAPE_TYPE.PICTURE:
            try:
                caption = perform_llm_prediction_for_image_bytes(shape.image.blob, self.llm, self.prompt)
            except Exception:
                # Best-effort captioning; unlike a bare `except`, this lets
                # KeyboardInterrupt/SystemExit propagate.
                caption = "unknown"
            parts.append("\n**Image Transcript:**\n" + caption + "\n--------------------\n")
    parts.append("\n")
    return "".join(parts)
|
|
48
66
|
|
|
49
67
|
def load(self):
    """Yield Documents built from ``get_content()``.

    This is a generator. String content (as returned for 'single' mode)
    becomes one Document with empty metadata. List content (one entry per
    page) is grouped into chunks of ``self.pages_per_chunk`` pages; each
    chunk's metadata carries its 1-based ``chunk_number`` and the positions of
    its pages within the list as ``pages_in_chunk``.
    """
    content = self.get_content()
    if isinstance(content, str):
        yield Document(page_content=content, metadata={})
        return
    if not isinstance(content, list):
        return

    pending = []
    emitted = 0
    for page_number, page in enumerate(content, start=1):
        pending.append(page)
        if len(pending) != self.pages_per_chunk:
            continue
        first_page = page_number - len(pending) + 1
        yield Document(
            page_content="\n".join(pending),
            metadata={"chunk_number": emitted + 1,
                      "pages_in_chunk": list(range(first_page, page_number + 1))}
        )
        pending = []
        emitted += 1

    # Whatever is left over forms a final, shorter chunk.
    if pending:
        total = len(content)
        yield Document(
            page_content="\n".join(pending),
            metadata={"chunk_number": emitted + 1,
                      "pages_in_chunk": list(range(total - len(pending) + 1, total + 1))}
        )
|
|
@@ -0,0 +1,9 @@
|
|
|
1
|
+
from pathlib import Path
|
|
2
|
+
from typing import Union
|
|
3
|
+
from langchain_community.document_loaders.python import PythonLoader
|
|
4
|
+
|
|
5
|
+
class AlitaPythonLoader(PythonLoader):
    """PythonLoader variant whose constructor tolerates extra keyword arguments.

    File reading, including any encoding handling, is left entirely to PythonLoader.
    """

    def __init__(self, file_path: Union[str, Path], **kwargs):
        """
        Args:
            file_path: Path of the Python source file to load.
            **kwargs: Accepted but not forwarded to PythonLoader.
        """
        super().__init__(file_path)
|
|
@@ -17,8 +17,6 @@ from langchain_core.documents import Document
|
|
|
17
17
|
from typing import List, Optional, Iterator
|
|
18
18
|
from json import dumps
|
|
19
19
|
from .utils import cleanse_data
|
|
20
|
-
from ..tools.log import print_log
|
|
21
|
-
|
|
22
20
|
|
|
23
21
|
class AlitaTableLoader(BaseLoader):
|
|
24
22
|
def __init__(self,
|
|
@@ -65,7 +63,7 @@ class AlitaTableLoader(BaseLoader):
|
|
|
65
63
|
"source": f'{self.file_path}:{idx+1}',
|
|
66
64
|
"table_source": self.file_path,
|
|
67
65
|
}
|
|
68
|
-
if len(docs) == 0:
|
|
66
|
+
if len(docs) == 0 and not self.raw_content:
|
|
69
67
|
header_metadata = metadata.copy()
|
|
70
68
|
header_metadata["header"] = "true"
|
|
71
69
|
header = "\t".join([str(value) for value in row.keys()])
|
|
@@ -74,7 +72,6 @@ class AlitaTableLoader(BaseLoader):
|
|
|
74
72
|
docs.append(Document(page_content=row, metadata=metadata))
|
|
75
73
|
continue
|
|
76
74
|
if self.json_documents:
|
|
77
|
-
# print_log(row)
|
|
78
75
|
metadata['columns'] = list(row.keys())
|
|
79
76
|
metadata['og_data'] = dumps(row)
|
|
80
77
|
docs.append(Document(page_content=self.row_processor(row), metadata=metadata))
|
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
from typing import Iterator
|
|
1
|
+
from typing import Iterator, Generator
|
|
2
2
|
|
|
3
3
|
from langchain_core.documents import Document
|
|
4
4
|
|
|
@@ -6,6 +6,9 @@ from langchain_community.document_loaders.base import BaseLoader
|
|
|
6
6
|
from langchain_community.document_loaders.helpers import detect_file_encodings
|
|
7
7
|
from langchain_core.tools import ToolException
|
|
8
8
|
|
|
9
|
+
from alita_sdk.tools.chunkers import markdown_chunker
|
|
10
|
+
|
|
11
|
+
|
|
9
12
|
class AlitaTextLoader(BaseLoader):
|
|
10
13
|
|
|
11
14
|
def __init__(self, **kwargs):
|
|
@@ -19,6 +22,8 @@ class AlitaTextLoader(BaseLoader):
|
|
|
19
22
|
raise ToolException("'file_path' or 'file_content' parameter should be provided.")
|
|
20
23
|
self.encoding = kwargs.get('encoding', 'utf-8')
|
|
21
24
|
self.autodetect_encoding = kwargs.get('autodetect_encoding', False)
|
|
25
|
+
self.max_tokens=kwargs.get('max_tokens', 1024)
|
|
26
|
+
self.token_overlap = kwargs.get('token_overlap', 10)
|
|
22
27
|
|
|
23
28
|
def get_content(self):
|
|
24
29
|
text = ""
|
|
@@ -59,8 +64,16 @@ class AlitaTextLoader(BaseLoader):
|
|
|
59
64
|
|
|
60
65
|
return text
|
|
61
66
|
|
|
67
|
+
def generate_document(self, text, metadata) -> Generator[Document, None, None]:
    """Yield exactly one Document wrapping ``text`` with the given ``metadata``."""
    document = Document(page_content=text, metadata=metadata)
    yield document
|
|
69
|
+
|
|
62
70
|
def lazy_load(self) -> Iterator[Document]:
    """Read the text and yield the chunks produced by markdown_chunker.

    The chunker receives a single Document whose ``source`` metadata is the
    file path (as a string) when one is set, otherwise ``file_name``, together
    with this loader's ``max_tokens`` and ``token_overlap`` settings.
    """
    text = self.get_content()
    if hasattr(self, 'file_path'):
        source = str(self.file_path)
    else:
        source = self.file_name
    single_document = self.generate_document(text, {"source": source})
    chunker_settings = {
        "max_tokens": self.max_tokens,
        "token_overlap": self.token_overlap
    }
    yield from markdown_chunker(file_content_generator=single_document, config=chunker_settings)
|
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
from typing import Iterator
|
|
2
|
+
|
|
3
|
+
from langchain_community.document_loaders.parsers.images import BaseImageBlobParser
|
|
4
|
+
from langchain_core.documents import Document
|
|
5
|
+
from langchain_core.documents.base import Blob
|
|
6
|
+
|
|
7
|
+
from alita_sdk.runtime.langchain.document_loaders.AlitaImageLoader import AlitaImageLoader
|
|
8
|
+
|
|
9
|
+
class ImageParser(BaseImageBlobParser):
    """Image blob parser that describes images through AlitaImageLoader."""

    def __init__(self, **kwargs):
        """
        Args:
            **kwargs: ``llm`` and ``prompt`` are read and stored; both default to None.
        """
        self.llm = kwargs.get('llm')
        self.prompt = kwargs.get('prompt')

    def lazy_parse(self, blob: Blob) -> Iterator[Document]:
        """Delegate to the base parser; on any error yield a placeholder Document."""
        try:
            yield from super().lazy_parse(blob)
        except Exception:
            # Best-effort: a failing image should not abort the surrounding load.
            yield Document(page_content="[Image: Unknown]")

    def _analyze_image(self, img) -> str:
        """Return AlitaImageLoader's content for ``img``, or a placeholder on failure.

        The image is first serialised to PNG bytes in memory.
        """
        from io import BytesIO

        png_buffer = BytesIO()
        img.save(png_buffer, format='PNG')
        png_bytes = png_buffer.getvalue()
        try:
            image_loader = AlitaImageLoader(
                file_content=png_bytes,
                file_name="image.png",
                prompt=self.prompt,
                llm=self.llm,
            )
            return image_loader.get_content()
        except Exception:
            return "Image: unknown"
|