alita-sdk 0.3.379__py3-none-any.whl → 0.3.627__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- alita_sdk/cli/__init__.py +10 -0
- alita_sdk/cli/__main__.py +17 -0
- alita_sdk/cli/agent/__init__.py +5 -0
- alita_sdk/cli/agent/default.py +258 -0
- alita_sdk/cli/agent_executor.py +156 -0
- alita_sdk/cli/agent_loader.py +245 -0
- alita_sdk/cli/agent_ui.py +228 -0
- alita_sdk/cli/agents.py +3113 -0
- alita_sdk/cli/callbacks.py +647 -0
- alita_sdk/cli/cli.py +168 -0
- alita_sdk/cli/config.py +306 -0
- alita_sdk/cli/context/__init__.py +30 -0
- alita_sdk/cli/context/cleanup.py +198 -0
- alita_sdk/cli/context/manager.py +731 -0
- alita_sdk/cli/context/message.py +285 -0
- alita_sdk/cli/context/strategies.py +289 -0
- alita_sdk/cli/context/token_estimation.py +127 -0
- alita_sdk/cli/formatting.py +182 -0
- alita_sdk/cli/input_handler.py +419 -0
- alita_sdk/cli/inventory.py +1073 -0
- alita_sdk/cli/mcp_loader.py +315 -0
- alita_sdk/cli/testcases/__init__.py +94 -0
- alita_sdk/cli/testcases/data_generation.py +119 -0
- alita_sdk/cli/testcases/discovery.py +96 -0
- alita_sdk/cli/testcases/executor.py +84 -0
- alita_sdk/cli/testcases/logger.py +85 -0
- alita_sdk/cli/testcases/parser.py +172 -0
- alita_sdk/cli/testcases/prompts.py +91 -0
- alita_sdk/cli/testcases/reporting.py +125 -0
- alita_sdk/cli/testcases/setup.py +108 -0
- alita_sdk/cli/testcases/test_runner.py +282 -0
- alita_sdk/cli/testcases/utils.py +39 -0
- alita_sdk/cli/testcases/validation.py +90 -0
- alita_sdk/cli/testcases/workflow.py +196 -0
- alita_sdk/cli/toolkit.py +327 -0
- alita_sdk/cli/toolkit_loader.py +85 -0
- alita_sdk/cli/tools/__init__.py +43 -0
- alita_sdk/cli/tools/approval.py +224 -0
- alita_sdk/cli/tools/filesystem.py +1751 -0
- alita_sdk/cli/tools/planning.py +389 -0
- alita_sdk/cli/tools/terminal.py +414 -0
- alita_sdk/community/__init__.py +72 -12
- alita_sdk/community/inventory/__init__.py +236 -0
- alita_sdk/community/inventory/config.py +257 -0
- alita_sdk/community/inventory/enrichment.py +2137 -0
- alita_sdk/community/inventory/extractors.py +1469 -0
- alita_sdk/community/inventory/ingestion.py +3172 -0
- alita_sdk/community/inventory/knowledge_graph.py +1457 -0
- alita_sdk/community/inventory/parsers/__init__.py +218 -0
- alita_sdk/community/inventory/parsers/base.py +295 -0
- alita_sdk/community/inventory/parsers/csharp_parser.py +907 -0
- alita_sdk/community/inventory/parsers/go_parser.py +851 -0
- alita_sdk/community/inventory/parsers/html_parser.py +389 -0
- alita_sdk/community/inventory/parsers/java_parser.py +593 -0
- alita_sdk/community/inventory/parsers/javascript_parser.py +629 -0
- alita_sdk/community/inventory/parsers/kotlin_parser.py +768 -0
- alita_sdk/community/inventory/parsers/markdown_parser.py +362 -0
- alita_sdk/community/inventory/parsers/python_parser.py +604 -0
- alita_sdk/community/inventory/parsers/rust_parser.py +858 -0
- alita_sdk/community/inventory/parsers/swift_parser.py +832 -0
- alita_sdk/community/inventory/parsers/text_parser.py +322 -0
- alita_sdk/community/inventory/parsers/yaml_parser.py +370 -0
- alita_sdk/community/inventory/patterns/__init__.py +61 -0
- alita_sdk/community/inventory/patterns/ast_adapter.py +380 -0
- alita_sdk/community/inventory/patterns/loader.py +348 -0
- alita_sdk/community/inventory/patterns/registry.py +198 -0
- alita_sdk/community/inventory/presets.py +535 -0
- alita_sdk/community/inventory/retrieval.py +1403 -0
- alita_sdk/community/inventory/toolkit.py +173 -0
- alita_sdk/community/inventory/toolkit_utils.py +176 -0
- alita_sdk/community/inventory/visualize.py +1370 -0
- alita_sdk/configurations/__init__.py +1 -1
- alita_sdk/configurations/ado.py +141 -20
- alita_sdk/configurations/bitbucket.py +94 -2
- alita_sdk/configurations/confluence.py +130 -1
- alita_sdk/configurations/figma.py +76 -0
- alita_sdk/configurations/gitlab.py +91 -0
- alita_sdk/configurations/jira.py +103 -0
- alita_sdk/configurations/openapi.py +329 -0
- alita_sdk/configurations/qtest.py +72 -1
- alita_sdk/configurations/report_portal.py +96 -0
- alita_sdk/configurations/sharepoint.py +148 -0
- alita_sdk/configurations/testio.py +83 -0
- alita_sdk/configurations/testrail.py +88 -0
- alita_sdk/configurations/xray.py +93 -0
- alita_sdk/configurations/zephyr_enterprise.py +93 -0
- alita_sdk/configurations/zephyr_essential.py +75 -0
- alita_sdk/runtime/clients/artifact.py +3 -3
- alita_sdk/runtime/clients/client.py +388 -46
- alita_sdk/runtime/clients/mcp_discovery.py +342 -0
- alita_sdk/runtime/clients/mcp_manager.py +262 -0
- alita_sdk/runtime/clients/sandbox_client.py +8 -21
- alita_sdk/runtime/langchain/_constants_bkup.py +1318 -0
- alita_sdk/runtime/langchain/assistant.py +157 -39
- alita_sdk/runtime/langchain/constants.py +647 -1
- alita_sdk/runtime/langchain/document_loaders/AlitaDocxMammothLoader.py +315 -3
- alita_sdk/runtime/langchain/document_loaders/AlitaExcelLoader.py +103 -60
- alita_sdk/runtime/langchain/document_loaders/AlitaJSONLinesLoader.py +77 -0
- alita_sdk/runtime/langchain/document_loaders/AlitaJSONLoader.py +10 -4
- alita_sdk/runtime/langchain/document_loaders/AlitaPowerPointLoader.py +226 -7
- alita_sdk/runtime/langchain/document_loaders/AlitaTextLoader.py +5 -2
- alita_sdk/runtime/langchain/document_loaders/constants.py +40 -19
- alita_sdk/runtime/langchain/langraph_agent.py +405 -84
- alita_sdk/runtime/langchain/utils.py +106 -7
- alita_sdk/runtime/llms/preloaded.py +2 -6
- alita_sdk/runtime/models/mcp_models.py +61 -0
- alita_sdk/runtime/skills/__init__.py +91 -0
- alita_sdk/runtime/skills/callbacks.py +498 -0
- alita_sdk/runtime/skills/discovery.py +540 -0
- alita_sdk/runtime/skills/executor.py +610 -0
- alita_sdk/runtime/skills/input_builder.py +371 -0
- alita_sdk/runtime/skills/models.py +330 -0
- alita_sdk/runtime/skills/registry.py +355 -0
- alita_sdk/runtime/skills/skill_runner.py +330 -0
- alita_sdk/runtime/toolkits/__init__.py +31 -0
- alita_sdk/runtime/toolkits/application.py +29 -10
- alita_sdk/runtime/toolkits/artifact.py +20 -11
- alita_sdk/runtime/toolkits/datasource.py +13 -6
- alita_sdk/runtime/toolkits/mcp.py +783 -0
- alita_sdk/runtime/toolkits/mcp_config.py +1048 -0
- alita_sdk/runtime/toolkits/planning.py +178 -0
- alita_sdk/runtime/toolkits/skill_router.py +238 -0
- alita_sdk/runtime/toolkits/subgraph.py +251 -6
- alita_sdk/runtime/toolkits/tools.py +356 -69
- alita_sdk/runtime/toolkits/vectorstore.py +11 -5
- alita_sdk/runtime/tools/__init__.py +10 -3
- alita_sdk/runtime/tools/application.py +27 -6
- alita_sdk/runtime/tools/artifact.py +511 -28
- alita_sdk/runtime/tools/data_analysis.py +183 -0
- alita_sdk/runtime/tools/function.py +67 -35
- alita_sdk/runtime/tools/graph.py +10 -4
- alita_sdk/runtime/tools/image_generation.py +148 -46
- alita_sdk/runtime/tools/llm.py +1003 -128
- alita_sdk/runtime/tools/loop.py +3 -1
- alita_sdk/runtime/tools/loop_output.py +3 -1
- alita_sdk/runtime/tools/mcp_inspect_tool.py +284 -0
- alita_sdk/runtime/tools/mcp_remote_tool.py +181 -0
- alita_sdk/runtime/tools/mcp_server_tool.py +8 -5
- alita_sdk/runtime/tools/planning/__init__.py +36 -0
- alita_sdk/runtime/tools/planning/models.py +246 -0
- alita_sdk/runtime/tools/planning/wrapper.py +607 -0
- alita_sdk/runtime/tools/router.py +2 -4
- alita_sdk/runtime/tools/sandbox.py +65 -48
- alita_sdk/runtime/tools/skill_router.py +776 -0
- alita_sdk/runtime/tools/tool.py +3 -1
- alita_sdk/runtime/tools/vectorstore.py +9 -3
- alita_sdk/runtime/tools/vectorstore_base.py +70 -14
- alita_sdk/runtime/utils/AlitaCallback.py +137 -21
- alita_sdk/runtime/utils/constants.py +5 -1
- alita_sdk/runtime/utils/mcp_client.py +492 -0
- alita_sdk/runtime/utils/mcp_oauth.py +361 -0
- alita_sdk/runtime/utils/mcp_sse_client.py +434 -0
- alita_sdk/runtime/utils/mcp_tools_discovery.py +124 -0
- alita_sdk/runtime/utils/serialization.py +155 -0
- alita_sdk/runtime/utils/streamlit.py +40 -13
- alita_sdk/runtime/utils/toolkit_utils.py +30 -9
- alita_sdk/runtime/utils/utils.py +36 -0
- alita_sdk/tools/__init__.py +134 -35
- alita_sdk/tools/ado/repos/__init__.py +51 -32
- alita_sdk/tools/ado/repos/repos_wrapper.py +148 -89
- alita_sdk/tools/ado/test_plan/__init__.py +25 -9
- alita_sdk/tools/ado/test_plan/test_plan_wrapper.py +23 -1
- alita_sdk/tools/ado/utils.py +1 -18
- alita_sdk/tools/ado/wiki/__init__.py +25 -12
- alita_sdk/tools/ado/wiki/ado_wrapper.py +291 -22
- alita_sdk/tools/ado/work_item/__init__.py +26 -13
- alita_sdk/tools/ado/work_item/ado_wrapper.py +73 -11
- alita_sdk/tools/advanced_jira_mining/__init__.py +11 -8
- alita_sdk/tools/aws/delta_lake/__init__.py +13 -9
- alita_sdk/tools/aws/delta_lake/tool.py +5 -1
- alita_sdk/tools/azure_ai/search/__init__.py +11 -8
- alita_sdk/tools/azure_ai/search/api_wrapper.py +1 -1
- alita_sdk/tools/base/tool.py +5 -1
- alita_sdk/tools/base_indexer_toolkit.py +271 -84
- alita_sdk/tools/bitbucket/__init__.py +17 -11
- alita_sdk/tools/bitbucket/api_wrapper.py +59 -11
- alita_sdk/tools/bitbucket/cloud_api_wrapper.py +49 -35
- alita_sdk/tools/browser/__init__.py +5 -4
- alita_sdk/tools/carrier/__init__.py +5 -6
- alita_sdk/tools/carrier/backend_reports_tool.py +6 -6
- alita_sdk/tools/carrier/run_ui_test_tool.py +6 -6
- alita_sdk/tools/carrier/ui_reports_tool.py +5 -5
- alita_sdk/tools/chunkers/__init__.py +3 -1
- alita_sdk/tools/chunkers/code/treesitter/treesitter.py +37 -13
- alita_sdk/tools/chunkers/sematic/json_chunker.py +1 -0
- alita_sdk/tools/chunkers/sematic/markdown_chunker.py +97 -6
- alita_sdk/tools/chunkers/sematic/proposal_chunker.py +1 -1
- alita_sdk/tools/chunkers/universal_chunker.py +270 -0
- alita_sdk/tools/cloud/aws/__init__.py +10 -7
- alita_sdk/tools/cloud/azure/__init__.py +10 -7
- alita_sdk/tools/cloud/gcp/__init__.py +10 -7
- alita_sdk/tools/cloud/k8s/__init__.py +10 -7
- alita_sdk/tools/code/linter/__init__.py +10 -8
- alita_sdk/tools/code/loaders/codesearcher.py +3 -2
- alita_sdk/tools/code/sonar/__init__.py +11 -8
- alita_sdk/tools/code_indexer_toolkit.py +82 -22
- alita_sdk/tools/confluence/__init__.py +22 -16
- alita_sdk/tools/confluence/api_wrapper.py +107 -30
- alita_sdk/tools/confluence/loader.py +14 -2
- alita_sdk/tools/custom_open_api/__init__.py +12 -5
- alita_sdk/tools/elastic/__init__.py +11 -8
- alita_sdk/tools/elitea_base.py +493 -30
- alita_sdk/tools/figma/__init__.py +58 -11
- alita_sdk/tools/figma/api_wrapper.py +1235 -143
- alita_sdk/tools/figma/figma_client.py +73 -0
- alita_sdk/tools/figma/toon_tools.py +2748 -0
- alita_sdk/tools/github/__init__.py +14 -15
- alita_sdk/tools/github/github_client.py +224 -100
- alita_sdk/tools/github/graphql_client_wrapper.py +119 -33
- alita_sdk/tools/github/schemas.py +14 -5
- alita_sdk/tools/github/tool.py +5 -1
- alita_sdk/tools/github/tool_prompts.py +9 -22
- alita_sdk/tools/gitlab/__init__.py +16 -11
- alita_sdk/tools/gitlab/api_wrapper.py +218 -48
- alita_sdk/tools/gitlab_org/__init__.py +10 -9
- alita_sdk/tools/gitlab_org/api_wrapper.py +63 -64
- alita_sdk/tools/google/bigquery/__init__.py +13 -12
- alita_sdk/tools/google/bigquery/tool.py +5 -1
- alita_sdk/tools/google_places/__init__.py +11 -8
- alita_sdk/tools/google_places/api_wrapper.py +1 -1
- alita_sdk/tools/jira/__init__.py +17 -10
- alita_sdk/tools/jira/api_wrapper.py +92 -41
- alita_sdk/tools/keycloak/__init__.py +11 -8
- alita_sdk/tools/localgit/__init__.py +9 -3
- alita_sdk/tools/localgit/local_git.py +62 -54
- alita_sdk/tools/localgit/tool.py +5 -1
- alita_sdk/tools/memory/__init__.py +12 -4
- alita_sdk/tools/non_code_indexer_toolkit.py +1 -0
- alita_sdk/tools/ocr/__init__.py +11 -8
- alita_sdk/tools/openapi/__init__.py +491 -106
- alita_sdk/tools/openapi/api_wrapper.py +1368 -0
- alita_sdk/tools/openapi/tool.py +20 -0
- alita_sdk/tools/pandas/__init__.py +20 -12
- alita_sdk/tools/pandas/api_wrapper.py +38 -25
- alita_sdk/tools/pandas/dataframe/generator/base.py +3 -1
- alita_sdk/tools/postman/__init__.py +10 -9
- alita_sdk/tools/pptx/__init__.py +11 -10
- alita_sdk/tools/pptx/pptx_wrapper.py +1 -1
- alita_sdk/tools/qtest/__init__.py +31 -11
- alita_sdk/tools/qtest/api_wrapper.py +2135 -86
- alita_sdk/tools/rally/__init__.py +10 -9
- alita_sdk/tools/rally/api_wrapper.py +1 -1
- alita_sdk/tools/report_portal/__init__.py +12 -8
- alita_sdk/tools/salesforce/__init__.py +10 -8
- alita_sdk/tools/servicenow/__init__.py +17 -15
- alita_sdk/tools/servicenow/api_wrapper.py +1 -1
- alita_sdk/tools/sharepoint/__init__.py +10 -7
- alita_sdk/tools/sharepoint/api_wrapper.py +129 -38
- alita_sdk/tools/sharepoint/authorization_helper.py +191 -1
- alita_sdk/tools/sharepoint/utils.py +8 -2
- alita_sdk/tools/slack/__init__.py +10 -7
- alita_sdk/tools/slack/api_wrapper.py +2 -2
- alita_sdk/tools/sql/__init__.py +12 -9
- alita_sdk/tools/testio/__init__.py +10 -7
- alita_sdk/tools/testrail/__init__.py +11 -10
- alita_sdk/tools/testrail/api_wrapper.py +1 -1
- alita_sdk/tools/utils/__init__.py +9 -4
- alita_sdk/tools/utils/content_parser.py +103 -18
- alita_sdk/tools/utils/text_operations.py +410 -0
- alita_sdk/tools/utils/tool_prompts.py +79 -0
- alita_sdk/tools/vector_adapters/VectorStoreAdapter.py +30 -13
- alita_sdk/tools/xray/__init__.py +13 -9
- alita_sdk/tools/yagmail/__init__.py +9 -3
- alita_sdk/tools/zephyr/__init__.py +10 -7
- alita_sdk/tools/zephyr_enterprise/__init__.py +11 -7
- alita_sdk/tools/zephyr_essential/__init__.py +10 -7
- alita_sdk/tools/zephyr_essential/api_wrapper.py +30 -13
- alita_sdk/tools/zephyr_essential/client.py +2 -2
- alita_sdk/tools/zephyr_scale/__init__.py +11 -8
- alita_sdk/tools/zephyr_scale/api_wrapper.py +2 -2
- alita_sdk/tools/zephyr_squad/__init__.py +10 -7
- {alita_sdk-0.3.379.dist-info → alita_sdk-0.3.627.dist-info}/METADATA +154 -8
- alita_sdk-0.3.627.dist-info/RECORD +468 -0
- alita_sdk-0.3.627.dist-info/entry_points.txt +2 -0
- alita_sdk-0.3.379.dist-info/RECORD +0 -360
- {alita_sdk-0.3.379.dist-info → alita_sdk-0.3.627.dist-info}/WHEEL +0 -0
- {alita_sdk-0.3.379.dist-info → alita_sdk-0.3.627.dist-info}/licenses/LICENSE +0 -0
- {alita_sdk-0.3.379.dist-info → alita_sdk-0.3.627.dist-info}/top_level.txt +0 -0
|
@@ -1,4 +1,5 @@
|
|
|
1
1
|
import re
|
|
2
|
+
import uuid
|
|
2
3
|
from io import BytesIO
|
|
3
4
|
|
|
4
5
|
import mammoth.images
|
|
@@ -8,6 +9,9 @@ from langchain_core.document_loaders import BaseLoader
|
|
|
8
9
|
from langchain_core.documents import Document
|
|
9
10
|
from mammoth import convert_to_html
|
|
10
11
|
from markdownify import markdownify
|
|
12
|
+
from docx import Document as DocxDocument
|
|
13
|
+
from docx.oxml.ns import qn
|
|
14
|
+
from bs4 import BeautifulSoup
|
|
11
15
|
|
|
12
16
|
from alita_sdk.tools.chunkers.sematic.markdown_chunker import markdown_by_headers_chunker
|
|
13
17
|
from .utils import perform_llm_prediction_for_image_bytes
|
|
@@ -17,6 +21,7 @@ class AlitaDocxMammothLoader(BaseLoader):
|
|
|
17
21
|
"""
|
|
18
22
|
Loader for Docx files using Mammoth to convert to HTML, with image handling,
|
|
19
23
|
and then Markdownify to convert HTML to markdown.
|
|
24
|
+
Detects bordered paragraphs and text boxes and treats them as code blocks.
|
|
20
25
|
"""
|
|
21
26
|
def __init__(self, **kwargs):
|
|
22
27
|
"""
|
|
@@ -97,6 +102,295 @@ class AlitaDocxMammothLoader(BaseLoader):
|
|
|
97
102
|
new_md = pattern.sub(replace_placeholder, original_md)
|
|
98
103
|
return new_md
|
|
99
104
|
|
|
105
|
+
def __has_border(self, paragraph):
    """
    Check if a paragraph has border formatting.

    Looks up the paragraph properties (w:pPr), then the paragraph border
    container (w:pBdr), and inspects the top/bottom/left/right sides.

    Args:
        paragraph: A python-docx Paragraph object.

    Returns:
        bool: True if at least one of the four sides declares a w:val that
        is neither "none" nor "nil", False otherwise.
    """
    pPr = paragraph._element.pPr
    if pPr is None:
        return False

    pBdr = pPr.find(qn('w:pBdr'))
    if pBdr is None:
        return False

    # OOXML spells "no border on this side" as either "none" or "nil";
    # neither of them may count as a visible border.
    invisible_styles = ('none', 'nil')
    for side in ('top', 'bottom', 'left', 'right'):
        border = pBdr.find(qn(f'w:{side}'))
        if border is None:
            continue
        val = border.get(qn('w:val'))
        if val and val not in invisible_styles:
            return True
    return False
|
|
128
|
+
|
|
129
|
+
def __find_text_boxes(self, doc):
    """
    Find all text boxes in document by searching OOXML structure.

    A text box is recognised by an element whose local name is exactly
    ``txbxContent``; its paragraphs are the descendants whose local name
    is exactly ``p``. Matching is done on the local name (namespace
    stripped) rather than on a string suffix, so elements such as
    ``w:top`` or ``w:wordWrap`` are not mistaken for paragraphs.

    Args:
        doc: A python-docx Document object.

    Returns:
        list: List of tuples (element, paragraphs_inside_textbox). Text
        boxes without any paragraph are omitted.
    """
    def local_name(node):
        tag = node.tag
        # Comment / processing-instruction nodes do not expose a string tag.
        if not isinstance(tag, str):
            return None
        return tag.rsplit('}', 1)[-1]

    text_boxes = []

    # Walk the whole body tree; text boxes can sit deep inside drawings.
    for element in doc.element.body.iter():
        if local_name(element) != 'txbxContent':
            continue

        # Collect every paragraph nested anywhere inside this text box.
        txbx_paragraphs = [
            node for node in element.iter() if local_name(node) == 'p'
        ]
        if txbx_paragraphs:
            text_boxes.append((element, txbx_paragraphs))

    return text_boxes
|
|
156
|
+
|
|
157
|
+
def __create_marker_paragraph(self, marker_text):
    """
    Build a standalone OOXML paragraph whose only run carries the marker.

    The resulting structure is w:p > w:r > w:t, with the text node set to
    ``marker_text``.

    Args:
        marker_text (str): The marker text to insert.

    Returns:
        Element: An OOXML paragraph element.
    """
    from docx.oxml import OxmlElement

    # Assemble from the inside out: text node, then run, then paragraph.
    text_node = OxmlElement('w:t')
    text_node.text = marker_text

    run_node = OxmlElement('w:r')
    run_node.append(text_node)

    paragraph_node = OxmlElement('w:p')
    paragraph_node.append(run_node)
    return paragraph_node
|
|
176
|
+
|
|
177
|
+
def __inject_markers_for_paragraph(self, paragraph, start_marker, end_marker):
    """
    Surround a single paragraph with a start and an end marker paragraph.

    The start marker becomes the previous sibling of the paragraph's XML
    element and the end marker becomes its next sibling.

    Args:
        paragraph: A python-docx Paragraph object.
        start_marker (str): The start marker text.
        end_marker (str): The end marker text.
    """
    target = paragraph._element
    # Start marker goes directly in front of the paragraph ...
    target.addprevious(self.__create_marker_paragraph(start_marker))
    # ... and the end marker directly behind it.
    target.addnext(self.__create_marker_paragraph(end_marker))
|
|
193
|
+
|
|
194
|
+
def __inject_markers_for_textbox(self, textbox_element, paragraph_elements, start_marker, end_marker):
    """
    Bracket the paragraphs of a text box with marker paragraphs.

    Args:
        textbox_element: The w:txbxContent element (not read by this method).
        paragraph_elements: List of paragraph elements inside the text box.
        start_marker (str): The start marker text.
        end_marker (str): The end marker text.
    """
    # Nothing to bracket when the text box holds no paragraphs.
    if paragraph_elements:
        head = paragraph_elements[0]
        tail = paragraph_elements[-1]
        # Start marker precedes the first paragraph, end marker follows the last.
        head.addprevious(self.__create_marker_paragraph(start_marker))
        tail.addnext(self.__create_marker_paragraph(end_marker))
|
|
216
|
+
|
|
217
|
+
def __detect_and_mark_bordered_content(self, docx_stream):
    """
    Mark runs of bordered paragraphs with unique start/end marker paragraphs.

    Paragraphs are taken from ``doc.paragraphs``; neighbouring bordered
    paragraphs in that sequence form one group, and each group receives one
    start marker before its first paragraph and one end marker after its
    last. Text boxes are not marked by this method.

    Args:
        docx_stream: A file-like object containing the DOCX document.

    Returns:
        tuple: (modified_docx_stream, start_marker, end_marker), where the
        stream is a BytesIO rewound to position 0.
    """
    doc = DocxDocument(docx_stream)

    # A random suffix keeps the markers from colliding with document text.
    token = uuid.uuid4().hex[:8]
    start_marker = f"<<<BORDERED_BLOCK_START_{token}>>>"
    end_marker = f"<<<BORDERED_BLOCK_END_{token}>>>"

    # Phase 1: collect runs of bordered paragraphs without touching the tree.
    runs = []
    pending = []
    for para in doc.paragraphs:
        if self.__has_border(para):
            pending.append(para)
        elif pending:
            runs.append(pending)
            pending = []
    # A run that reaches the end of the document still has to be recorded.
    if pending:
        runs.append(pending)

    # Phase 2: wrap each run with its pair of marker paragraphs.
    for run in runs:
        opening = self.__create_marker_paragraph(start_marker)
        run[0]._element.addprevious(opening)

        closing = self.__create_marker_paragraph(end_marker)
        run[-1]._element.addnext(closing)

    # Serialise the modified document into a fresh, rewound in-memory stream.
    buffer = BytesIO()
    doc.save(buffer)
    buffer.seek(0)

    return buffer, start_marker, end_marker
|
|
279
|
+
|
|
280
|
+
def __contains_complex_structure(self, content_html):
    """
    Tell whether an HTML fragment holds a table or a list.

    Args:
        content_html (str): HTML content to analyze.

    Returns:
        bool: True if a <table>, <ul> or <ol> element is present,
        False otherwise.
    """
    parsed = BeautifulSoup(content_html, 'html.parser')
    # Tables are probed first, then unordered and ordered lists.
    return any(parsed.find(tag) for tag in ('table', 'ul', 'ol'))
|
|
301
|
+
|
|
302
|
+
def __escape_hash_symbols(self, html_content):
    """
    Backslash-escape a leading hash (#) in text elements so the text is not
    read as a markdown header later on.

    Only elements for which BeautifulSoup reports a single ``.string`` are
    considered; the first '#' of such a string is replaced by '\\#' when the
    stripped string starts with '#'.

    Args:
        html_content (str): HTML content.

    Returns:
        str: HTML with escaped hash symbols.
    """
    soup = BeautifulSoup(html_content, 'html.parser')
    text_tags = ['p', 'li', 'td', 'th', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6']

    for node in soup.find_all(text_tags):
        original = node.string
        if not original:
            continue
        # Leading whitespace is ignored when deciding, but kept in the text.
        if original.strip().startswith('#'):
            node.string = original.replace('#', '\\#', 1)

    return str(soup)
|
|
324
|
+
|
|
325
|
+
def __wrap_marked_sections_in_code_blocks(self, html, start_marker, end_marker):
    """
    Find content between markers and wrap appropriately:
    - Simple text/code → <pre><code> block
    - Tables/lists → <div class="alita-bordered-content"> with preserved structure

    Args:
        html (str): The HTML content from Mammoth.
        start_marker (str): The start marker text.
        end_marker (str): The end marker text.

    Returns:
        str: HTML with marked sections wrapped appropriately. HTML without
        any marker pair is returned unchanged.
    """
    import html as html_module

    # Mammoth escapes < and > to &lt; and &gt;, so we need to escape our markers too
    escaped_start = html_module.escape(start_marker)
    escaped_end = html_module.escape(end_marker)

    # The markers sit in their own <p> tags; everything between a start and
    # the nearest end marker (tags included) is captured non-greedily.
    pattern = re.compile(
        f'<p>{re.escape(escaped_start)}</p>(.*?)<p>{re.escape(escaped_end)}</p>',
        re.DOTALL
    )

    def replace_with_appropriate_wrapper(match):
        content = match.group(1)

        if self.__contains_complex_structure(content):
            # Preserve structure: keep HTML as-is, escape # symbols, and wrap
            # in a div with a special class for potential custom handling.
            escaped_content = self.__escape_hash_symbols(content)
            return f'<div class="alita-bordered-content">{escaped_content}</div>'

        # Simple text/code: extract as plain text and wrap in a code block.
        content_soup = BeautifulSoup(content, 'html.parser')

        # Turn every <br> into a newline on the parsed tree itself, so both
        # the per-paragraph path and the fallback path below see the breaks.
        for br in content_soup.find_all('br'):
            br.replace_with('\n')

        # One entry per paragraph/heading; keep indentation, drop trailing
        # whitespace only.
        lines = [
            element.get_text().rstrip()
            for element in content_soup.find_all(['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6'])
        ]

        # If no paragraphs found, just get all text
        if not lines:
            lines = [line.rstrip() for line in content_soup.get_text().split('\n')]

        # Drop leading/trailing blank lines only; the indentation of the
        # first remaining line must survive.
        all_lines = '\n'.join(lines).split('\n')
        first = 0
        last = len(all_lines)
        while first < last and not all_lines[first].strip():
            first += 1
        while last > first and not all_lines[last - 1].strip():
            last -= 1
        content_text = '\n'.join(all_lines[first:last]).rstrip()

        # The text goes back into HTML, so it has to be escaped again.
        content_escaped = html_module.escape(content_text)
        return f'<pre><code>{content_escaped}</code></pre>'

    # Replace all marked sections with appropriate wrappers
    return pattern.sub(replace_with_appropriate_wrapper, html)
|
|
393
|
+
|
|
100
394
|
def load(self):
|
|
101
395
|
"""
|
|
102
396
|
Loads and converts the Docx file to markdown format.
|
|
@@ -131,6 +425,7 @@ class AlitaDocxMammothLoader(BaseLoader):
|
|
|
131
425
|
def _convert_docx_to_markdown(self, docx_file):
|
|
132
426
|
"""
|
|
133
427
|
Converts the content of a Docx file to markdown format.
|
|
428
|
+
Detects bordered content and treats it as code blocks.
|
|
134
429
|
|
|
135
430
|
Args:
|
|
136
431
|
docx_file (BinaryIO): The Docx file object.
|
|
@@ -138,11 +433,28 @@ class AlitaDocxMammothLoader(BaseLoader):
|
|
|
138
433
|
Returns:
|
|
139
434
|
str: The markdown content extracted from the Docx file.
|
|
140
435
|
"""
|
|
436
|
+
# Step 1: Detect and mark bordered content
|
|
437
|
+
# Reset stream position if needed
|
|
438
|
+
if hasattr(docx_file, 'seek'):
|
|
439
|
+
docx_file.seek(0)
|
|
440
|
+
|
|
441
|
+
marked_docx, start_marker, end_marker = self.__detect_and_mark_bordered_content(docx_file)
|
|
442
|
+
|
|
443
|
+
# Step 2: Convert marked DOCX to HTML using Mammoth
|
|
141
444
|
if self.extract_images:
|
|
142
445
|
# Extract images using the provided image handler
|
|
143
|
-
result = convert_to_html(
|
|
446
|
+
result = convert_to_html(marked_docx, convert_image=mammoth.images.img_element(self.__handle_image))
|
|
144
447
|
else:
|
|
145
448
|
# Ignore images
|
|
146
|
-
result = convert_to_html(
|
|
147
|
-
|
|
449
|
+
result = convert_to_html(marked_docx, convert_image=lambda image: "")
|
|
450
|
+
|
|
451
|
+
# Step 3: Wrap marked sections in <pre><code> tags
|
|
452
|
+
html_with_code_blocks = self.__wrap_marked_sections_in_code_blocks(
|
|
453
|
+
result.value, start_marker, end_marker
|
|
454
|
+
)
|
|
455
|
+
|
|
456
|
+
# Step 4: Convert HTML to markdown
|
|
457
|
+
content = markdownify(html_with_code_blocks, heading_style="ATX")
|
|
458
|
+
|
|
459
|
+
# Step 5: Post-process markdown (for image transcripts, etc.)
|
|
148
460
|
return self.__postprocess_original_md(content)
|
|
@@ -21,14 +21,16 @@ from openpyxl import load_workbook
|
|
|
21
21
|
from xlrd import open_workbook
|
|
22
22
|
from langchain_core.documents import Document
|
|
23
23
|
from .AlitaTableLoader import AlitaTableLoader
|
|
24
|
+
from alita_sdk.runtime.langchain.constants import LOADER_MAX_TOKENS_DEFAULT
|
|
24
25
|
|
|
25
26
|
cell_delimiter = " | "
|
|
26
27
|
|
|
27
28
|
class AlitaExcelLoader(AlitaTableLoader):
|
|
28
|
-
excel_by_sheets: bool = False
|
|
29
29
|
sheet_name: str = None
|
|
30
|
-
return_type: str = 'str'
|
|
31
30
|
file_name: str = None
|
|
31
|
+
max_tokens: int = LOADER_MAX_TOKENS_DEFAULT
|
|
32
|
+
add_header_to_chunks: bool = False
|
|
33
|
+
header_row_number: int = 1
|
|
32
34
|
|
|
33
35
|
def __init__(self, **kwargs):
|
|
34
36
|
if not kwargs.get('file_path'):
|
|
@@ -39,9 +41,22 @@ class AlitaExcelLoader(AlitaTableLoader):
|
|
|
39
41
|
else:
|
|
40
42
|
self.file_name = kwargs.get('file_path')
|
|
41
43
|
super().__init__(**kwargs)
|
|
42
|
-
self.excel_by_sheets = kwargs.get('excel_by_sheets')
|
|
43
|
-
self.return_type = kwargs.get('return_type')
|
|
44
44
|
self.sheet_name = kwargs.get('sheet_name')
|
|
45
|
+
# Set and validate chunking parameters only once
|
|
46
|
+
self.max_tokens = int(kwargs.get('max_tokens', LOADER_MAX_TOKENS_DEFAULT))
|
|
47
|
+
self.add_header_to_chunks = bool(kwargs.get('add_header_to_chunks', False))
|
|
48
|
+
header_row_number = kwargs.get('header_row_number', 1)
|
|
49
|
+
# Validate header_row_number
|
|
50
|
+
try:
|
|
51
|
+
header_row_number = int(header_row_number)
|
|
52
|
+
if header_row_number > 0:
|
|
53
|
+
self.header_row_number = header_row_number
|
|
54
|
+
else:
|
|
55
|
+
self.header_row_number = 1
|
|
56
|
+
self.add_header_to_chunks = False
|
|
57
|
+
except (ValueError, TypeError):
|
|
58
|
+
self.header_row_number = 1
|
|
59
|
+
self.add_header_to_chunks = False
|
|
45
60
|
|
|
46
61
|
def get_content(self):
|
|
47
62
|
try:
|
|
@@ -64,59 +79,32 @@ class AlitaExcelLoader(AlitaTableLoader):
|
|
|
64
79
|
Reads .xlsx files using openpyxl.
|
|
65
80
|
"""
|
|
66
81
|
workbook = load_workbook(self.file_path, data_only=True) # `data_only=True` ensures we get cell values, not formulas
|
|
67
|
-
|
|
82
|
+
sheets = workbook.sheetnames
|
|
68
83
|
if self.sheet_name:
|
|
69
|
-
|
|
70
|
-
if self.sheet_name in workbook.sheetnames:
|
|
84
|
+
if self.sheet_name in sheets:
|
|
71
85
|
sheet_content = self.parse_sheet(workbook[self.sheet_name])
|
|
72
|
-
return sheet_content
|
|
73
86
|
else:
|
|
74
|
-
|
|
75
|
-
|
|
76
|
-
# Parse each sheet individually and return as a dictionary
|
|
77
|
-
result = {}
|
|
78
|
-
for sheet_name in workbook.sheetnames:
|
|
79
|
-
sheet_content = self.parse_sheet(workbook[sheet_name])
|
|
80
|
-
result[sheet_name] = sheet_content
|
|
81
|
-
return result
|
|
87
|
+
sheet_content = [f"Sheet '{self.sheet_name}' does not exist in the workbook."]
|
|
88
|
+
return {self.sheet_name: sheet_content}
|
|
82
89
|
else:
|
|
83
|
-
#
|
|
84
|
-
|
|
85
|
-
for sheet_name in workbook.sheetnames:
|
|
86
|
-
sheet_content = self.parse_sheet(workbook[sheet_name])
|
|
87
|
-
result.append(f"====== Sheet name: {sheet_name} ======\n{sheet_content}")
|
|
88
|
-
return "\n\n".join(result)
|
|
90
|
+
# Dictionary comprehension for all sheets
|
|
91
|
+
return {name: self.parse_sheet(workbook[name]) for name in sheets}
|
|
89
92
|
|
|
90
93
|
def _read_xls(self):
|
|
91
94
|
"""
|
|
92
95
|
Reads .xls files using xlrd.
|
|
93
96
|
"""
|
|
94
97
|
workbook = open_workbook(filename=self.file_name, file_contents=self.file_content)
|
|
95
|
-
|
|
98
|
+
sheets = workbook.sheet_names()
|
|
96
99
|
if self.sheet_name:
|
|
97
|
-
|
|
98
|
-
if self.sheet_name in workbook.sheet_names():
|
|
100
|
+
if self.sheet_name in sheets:
|
|
99
101
|
sheet = workbook.sheet_by_name(self.sheet_name)
|
|
100
|
-
|
|
101
|
-
return sheet_content
|
|
102
|
+
return {self.sheet_name: self.parse_sheet_xls(sheet)}
|
|
102
103
|
else:
|
|
103
|
-
|
|
104
|
-
elif self.excel_by_sheets:
|
|
105
|
-
# Parse each sheet individually and return as a dictionary
|
|
106
|
-
result = {}
|
|
107
|
-
for sheet_name in workbook.sheet_names():
|
|
108
|
-
sheet = workbook.sheet_by_name(sheet_name)
|
|
109
|
-
sheet_content = self.parse_sheet_xls(sheet)
|
|
110
|
-
result[sheet_name] = sheet_content
|
|
111
|
-
return result
|
|
104
|
+
return {self.sheet_name: [f"Sheet '{self.sheet_name}' does not exist in the workbook."]}
|
|
112
105
|
else:
|
|
113
|
-
#
|
|
114
|
-
|
|
115
|
-
for sheet_name in workbook.sheet_names():
|
|
116
|
-
sheet = workbook.sheet_by_name(sheet_name)
|
|
117
|
-
sheet_content = self.parse_sheet_xls(sheet)
|
|
118
|
-
result.append(f"====== Sheet name: {sheet_name} ======\n{sheet_content}")
|
|
119
|
-
return "\n\n".join(result)
|
|
106
|
+
# Dictionary comprehension for all sheets
|
|
107
|
+
return {name: self.parse_sheet_xls(workbook.sheet_by_name(name)) for name in sheets}
|
|
120
108
|
|
|
121
109
|
def parse_sheet(self, sheet):
|
|
122
110
|
"""
|
|
@@ -170,34 +158,89 @@ class AlitaExcelLoader(AlitaTableLoader):
|
|
|
170
158
|
# Format the sheet content based on the return type
|
|
171
159
|
return self._format_sheet_content(sheet_content)
|
|
172
160
|
|
|
173
|
-
def _format_sheet_content(self,
|
|
161
|
+
def _format_sheet_content(self, rows):
|
|
174
162
|
"""
|
|
175
|
-
|
|
163
|
+
Specification:
|
|
164
|
+
Formats a list of sheet rows into a list of string chunks according to the following rules:
|
|
165
|
+
1. If max_tokens < 1, returns a single chunk (list of one string) with all rows joined by a newline ('\n').
|
|
166
|
+
- If add_header_to_chunks is True and header_row_number is valid, the specified header row is prepended as the first line.
|
|
167
|
+
2. If max_tokens >= 1:
|
|
168
|
+
a. Each chunk is a string containing one or more rows, separated by newlines ('\n'), such that the total token count (as measured by tiktoken) does not exceed max_tokens.
|
|
169
|
+
b. If add_header_to_chunks is True and header_row_number is valid, the specified header row is prepended once at the top of each chunk (not before every row).
|
|
170
|
+
c. If a single row exceeds max_tokens, it is placed in its own chunk without splitting, with the header prepended if applicable.
|
|
171
|
+
3. Returns: List[str], where each string is a chunk ready for further processing.
|
|
176
172
|
"""
|
|
177
|
-
|
|
178
|
-
|
|
179
|
-
|
|
180
|
-
|
|
181
|
-
|
|
182
|
-
|
|
183
|
-
|
|
184
|
-
|
|
185
|
-
|
|
186
|
-
|
|
187
|
-
|
|
173
|
+
import tiktoken
|
|
174
|
+
encoding = tiktoken.get_encoding('cl100k_base')
|
|
175
|
+
|
|
176
|
+
# --- Inner functions ---
|
|
177
|
+
def count_tokens(text):
|
|
178
|
+
"""Count tokens in text using tiktoken encoding."""
|
|
179
|
+
return len(encoding.encode(text))
|
|
180
|
+
|
|
181
|
+
def finalize_chunk(chunk_rows):
|
|
182
|
+
"""Join rows for a chunk, prepending header if needed."""
|
|
183
|
+
if self.add_header_to_chunks and header:
|
|
184
|
+
return '\n'.join([header] + chunk_rows)
|
|
185
|
+
else:
|
|
186
|
+
return '\n'.join(chunk_rows)
|
|
187
|
+
# --- End inner functions ---
|
|
188
|
+
|
|
189
|
+
# If max_tokens < 1, return all rows as a single chunk
|
|
190
|
+
if self.max_tokens < 1:
|
|
191
|
+
return ['\n'.join(rows)]
|
|
192
|
+
|
|
193
|
+
# Extract header if needed
|
|
194
|
+
header = None
|
|
195
|
+
if self.add_header_to_chunks and rows:
|
|
196
|
+
header_idx = self.header_row_number - 1
|
|
197
|
+
header = rows.pop(header_idx)
|
|
198
|
+
|
|
199
|
+
chunks = [] # List to store final chunks
|
|
200
|
+
current_chunk = [] # Accumulate rows for the current chunk
|
|
201
|
+
current_tokens = 0 # Token count for the current chunk
|
|
202
|
+
|
|
203
|
+
for row in rows:
|
|
204
|
+
row_tokens = count_tokens(row)
|
|
205
|
+
# If row itself exceeds max_tokens, flush current chunk and add row as its own chunk (with header if needed)
|
|
206
|
+
if row_tokens > self.max_tokens:
|
|
207
|
+
if current_chunk:
|
|
208
|
+
chunks.append(finalize_chunk(current_chunk))
|
|
209
|
+
current_chunk = []
|
|
210
|
+
current_tokens = 0
|
|
211
|
+
# Add the large row as its own chunk, with header if needed
|
|
212
|
+
if self.add_header_to_chunks and header:
|
|
213
|
+
chunks.append(finalize_chunk([row]))
|
|
214
|
+
else:
|
|
215
|
+
chunks.append(row)
|
|
216
|
+
continue
|
|
217
|
+
# If adding row would exceed max_tokens, flush current chunk and start new
|
|
218
|
+
if current_tokens + row_tokens > self.max_tokens:
|
|
219
|
+
if current_chunk:
|
|
220
|
+
chunks.append(finalize_chunk(current_chunk))
|
|
221
|
+
current_chunk = [row]
|
|
222
|
+
current_tokens = row_tokens
|
|
223
|
+
else:
|
|
224
|
+
current_chunk.append(row)
|
|
225
|
+
current_tokens += row_tokens
|
|
226
|
+
# Add any remaining rows as the last chunk
|
|
227
|
+
if current_chunk:
|
|
228
|
+
chunks.append(finalize_chunk(current_chunk))
|
|
229
|
+
return chunks
|
|
188
230
|
|
|
189
231
|
def load(self) -> list:
|
|
190
232
|
docs = []
|
|
191
233
|
content_per_sheet = self.get_content()
|
|
192
|
-
|
|
234
|
+
# content_per_sheet is a dict of sheet_name: list of chunk strings
|
|
235
|
+
for sheet_name, content_chunks in content_per_sheet.items():
|
|
193
236
|
metadata = {
|
|
194
237
|
"source": f'{self.file_path}:{sheet_name}',
|
|
195
238
|
"sheet_name": sheet_name,
|
|
196
239
|
"file_type": "excel",
|
|
197
|
-
"excel_by_sheets": self.excel_by_sheets,
|
|
198
|
-
"return_type": self.return_type,
|
|
199
240
|
}
|
|
200
|
-
|
|
241
|
+
# Each chunk is a separate Document
|
|
242
|
+
for chunk in content_chunks:
|
|
243
|
+
docs.append(Document(page_content=chunk, metadata=metadata))
|
|
201
244
|
return docs
|
|
202
245
|
|
|
203
246
|
def read(self, lazy: bool = False):
|
|
@@ -0,0 +1,77 @@
|
|
|
1
|
+
from .AlitaJSONLoader import AlitaJSONLoader
|
|
2
|
+
import json
|
|
3
|
+
from io import StringIO
|
|
4
|
+
from typing import List, Iterator
|
|
5
|
+
|
|
6
|
+
from langchain_core.documents import Document
|
|
7
|
+
from langchain_core.tools import ToolException
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
class AlitaJSONLinesLoader(AlitaJSONLoader):
|
|
11
|
+
"""Load local JSONL files (one JSON object per line) using AlitaJSONLoader behavior.
|
|
12
|
+
|
|
13
|
+
Behavior:
|
|
14
|
+
- Supports both `file_path` and `file_content` (bytes or file-like object), same as AlitaJSONLoader.
|
|
15
|
+
- Treats each non-empty line as an independent JSON object.
|
|
16
|
+
- Aggregates all parsed JSON objects into a list and feeds them through the same
|
|
17
|
+
RecursiveJsonSplitter-based chunking used by AlitaJSONLoader.lazy_load.
|
|
18
|
+
- Returns a list of Documents with chunked JSON content.
|
|
19
|
+
"""
|
|
20
|
+
|
|
21
|
+
def __init__(self, **kwargs):
|
|
22
|
+
# Reuse AlitaJSONLoader initialization logic (file_path / file_content handling, encoding, etc.)
|
|
23
|
+
super().__init__(**kwargs)
|
|
24
|
+
|
|
25
|
+
def _iter_lines(self) -> Iterator[str]:
|
|
26
|
+
"""Yield lines from file_path or file_content, mirroring AlitaJSONLoader sources."""
|
|
27
|
+
# Prefer file_path if available
|
|
28
|
+
if hasattr(self, "file_path") and self.file_path:
|
|
29
|
+
with open(self.file_path, "r", encoding=self.encoding) as f:
|
|
30
|
+
for line in f:
|
|
31
|
+
yield line
|
|
32
|
+
# Fallback to file_content if available
|
|
33
|
+
elif hasattr(self, "file_content") and self.file_content:
|
|
34
|
+
# file_content may be bytes or a file-like object
|
|
35
|
+
if isinstance(self.file_content, (bytes, bytearray)):
|
|
36
|
+
text = self.file_content.decode(self.encoding)
|
|
37
|
+
for line in StringIO(text):
|
|
38
|
+
yield line
|
|
39
|
+
else:
|
|
40
|
+
# Assume it's a text file-like object positioned at the beginning
|
|
41
|
+
self.file_content.seek(0)
|
|
42
|
+
for line in self.file_content:
|
|
43
|
+
yield line
|
|
44
|
+
else:
|
|
45
|
+
raise ToolException("'file_path' or 'file_content' parameter should be provided.")
|
|
46
|
+
|
|
47
|
+
def load(self) -> List[Document]: # type: ignore[override]
|
|
48
|
+
"""Load JSONL content by delegating each non-empty line to AlitaJSONLoader.
|
|
49
|
+
|
|
50
|
+
For each non-empty line in the underlying source (file_path or file_content):
|
|
51
|
+
- Create a temporary AlitaJSONLoader instance with that line as file_content.
|
|
52
|
+
- Call lazy_load() on that instance to apply the same RecursiveJsonSplitter logic
|
|
53
|
+
as for a normal JSON file.
|
|
54
|
+
- Accumulate all Documents from all lines and return them as a single list.
|
|
55
|
+
"""
|
|
56
|
+
docs: List[Document] = []
|
|
57
|
+
|
|
58
|
+
for raw_line in self._iter_lines():
|
|
59
|
+
line = raw_line.strip()
|
|
60
|
+
if not line:
|
|
61
|
+
continue
|
|
62
|
+
try:
|
|
63
|
+
# Instantiate a per-line AlitaJSONLoader using the same configuration
|
|
64
|
+
line_loader = AlitaJSONLoader(
|
|
65
|
+
file_content=line,
|
|
66
|
+
file_name=getattr(self, "file_name", str(getattr(self, "file_path", "no_name"))),
|
|
67
|
+
encoding=self.encoding,
|
|
68
|
+
autodetect_encoding=self.autodetect_encoding,
|
|
69
|
+
max_tokens=self.max_tokens,
|
|
70
|
+
)
|
|
71
|
+
|
|
72
|
+
for doc in line_loader.lazy_load():
|
|
73
|
+
docs.append(doc)
|
|
74
|
+
except Exception as e:
|
|
75
|
+
raise ToolException(f"Error processing JSONL line: {line[:100]}... Error: {e}") from e
|
|
76
|
+
|
|
77
|
+
return docs
|