alita-sdk 0.3.374__py3-none-any.whl → 0.3.423__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of alita-sdk might be problematic. Click here for more details.
- alita_sdk/configurations/bitbucket.py +95 -0
- alita_sdk/configurations/confluence.py +96 -1
- alita_sdk/configurations/gitlab.py +79 -0
- alita_sdk/configurations/jira.py +103 -0
- alita_sdk/configurations/testrail.py +88 -0
- alita_sdk/configurations/xray.py +93 -0
- alita_sdk/configurations/zephyr_enterprise.py +93 -0
- alita_sdk/configurations/zephyr_essential.py +75 -0
- alita_sdk/runtime/clients/client.py +3 -2
- alita_sdk/runtime/clients/sandbox_client.py +8 -0
- alita_sdk/runtime/langchain/assistant.py +56 -40
- alita_sdk/runtime/langchain/constants.py +4 -0
- alita_sdk/runtime/langchain/document_loaders/AlitaDocxMammothLoader.py +315 -3
- alita_sdk/runtime/langchain/document_loaders/AlitaJSONLoader.py +4 -1
- alita_sdk/runtime/langchain/document_loaders/constants.py +28 -12
- alita_sdk/runtime/langchain/langraph_agent.py +92 -28
- alita_sdk/runtime/langchain/utils.py +24 -4
- alita_sdk/runtime/toolkits/application.py +8 -1
- alita_sdk/runtime/toolkits/tools.py +80 -49
- alita_sdk/runtime/tools/__init__.py +7 -2
- alita_sdk/runtime/tools/application.py +7 -0
- alita_sdk/runtime/tools/function.py +28 -23
- alita_sdk/runtime/tools/graph.py +10 -4
- alita_sdk/runtime/tools/image_generation.py +104 -8
- alita_sdk/runtime/tools/llm.py +146 -114
- alita_sdk/runtime/tools/sandbox.py +166 -63
- alita_sdk/runtime/tools/vectorstore.py +22 -21
- alita_sdk/runtime/tools/vectorstore_base.py +16 -15
- alita_sdk/runtime/utils/utils.py +1 -0
- alita_sdk/tools/__init__.py +43 -31
- alita_sdk/tools/ado/work_item/ado_wrapper.py +17 -8
- alita_sdk/tools/base_indexer_toolkit.py +102 -93
- alita_sdk/tools/code_indexer_toolkit.py +15 -5
- alita_sdk/tools/confluence/api_wrapper.py +30 -8
- alita_sdk/tools/confluence/loader.py +10 -0
- alita_sdk/tools/elitea_base.py +22 -22
- alita_sdk/tools/gitlab/api_wrapper.py +8 -9
- alita_sdk/tools/jira/api_wrapper.py +1 -1
- alita_sdk/tools/non_code_indexer_toolkit.py +2 -2
- alita_sdk/tools/openapi/__init__.py +10 -1
- alita_sdk/tools/qtest/api_wrapper.py +298 -51
- alita_sdk/tools/sharepoint/api_wrapper.py +104 -33
- alita_sdk/tools/sharepoint/authorization_helper.py +175 -1
- alita_sdk/tools/sharepoint/utils.py +8 -2
- alita_sdk/tools/utils/content_parser.py +27 -16
- alita_sdk/tools/vector_adapters/VectorStoreAdapter.py +38 -25
- {alita_sdk-0.3.374.dist-info → alita_sdk-0.3.423.dist-info}/METADATA +1 -1
- {alita_sdk-0.3.374.dist-info → alita_sdk-0.3.423.dist-info}/RECORD +51 -51
- {alita_sdk-0.3.374.dist-info → alita_sdk-0.3.423.dist-info}/WHEEL +0 -0
- {alita_sdk-0.3.374.dist-info → alita_sdk-0.3.423.dist-info}/licenses/LICENSE +0 -0
- {alita_sdk-0.3.374.dist-info → alita_sdk-0.3.423.dist-info}/top_level.txt +0 -0
|
@@ -18,3 +18,78 @@ class ZephyrEssentialConfiguration(BaseModel):
|
|
|
18
18
|
)
|
|
19
19
|
base_url: Optional[str] = Field(description="Zephyr Essential API Base URL", default=None)
|
|
20
20
|
token: SecretStr = Field(description="Zephyr Essential API Token")
|
|
21
|
+
|
|
22
|
+
@staticmethod
|
|
23
|
+
def check_connection(settings: dict) -> str | None:
|
|
24
|
+
"""
|
|
25
|
+
Check the connection to Zephyr Essential (Zephyr Scale).
|
|
26
|
+
|
|
27
|
+
Args:
|
|
28
|
+
settings: Dictionary containing Zephyr Essential configuration
|
|
29
|
+
- base_url: Zephyr Essential API Base URL (optional, defaults to Zephyr Scale Cloud API)
|
|
30
|
+
- token: Zephyr Essential API Token (required)
|
|
31
|
+
|
|
32
|
+
Returns:
|
|
33
|
+
None if connection successful, error message string if failed
|
|
34
|
+
"""
|
|
35
|
+
import requests
|
|
36
|
+
|
|
37
|
+
# Get base_url or use default
|
|
38
|
+
base_url = settings.get("base_url")
|
|
39
|
+
if base_url:
|
|
40
|
+
base_url = base_url.strip().rstrip("/")
|
|
41
|
+
# Validate URL format if provided
|
|
42
|
+
if not base_url.startswith(("http://", "https://")):
|
|
43
|
+
return "Zephyr Essential URL must start with http:// or https://"
|
|
44
|
+
else:
|
|
45
|
+
# Default to Zephyr Scale Cloud API
|
|
46
|
+
base_url = "https://api.zephyrscale.smartbear.com/v2"
|
|
47
|
+
|
|
48
|
+
# Validate token
|
|
49
|
+
token = settings.get("token")
|
|
50
|
+
if not token:
|
|
51
|
+
return "Zephyr Essential API token is required"
|
|
52
|
+
|
|
53
|
+
# Extract token value if it's a SecretStr
|
|
54
|
+
token_value = token.get_secret_value() if hasattr(token, 'get_secret_value') else token
|
|
55
|
+
|
|
56
|
+
if not token_value or not str(token_value).strip():
|
|
57
|
+
return "Zephyr Essential API token cannot be empty"
|
|
58
|
+
|
|
59
|
+
# Test connection using /projects endpoint (requires authentication)
|
|
60
|
+
test_url = f"{base_url}/projects"
|
|
61
|
+
|
|
62
|
+
headers = {
|
|
63
|
+
"Authorization": f"Bearer {str(token_value).strip()}"
|
|
64
|
+
}
|
|
65
|
+
|
|
66
|
+
try:
|
|
67
|
+
response = requests.get(
|
|
68
|
+
test_url,
|
|
69
|
+
headers=headers,
|
|
70
|
+
timeout=10
|
|
71
|
+
)
|
|
72
|
+
|
|
73
|
+
# Check response status
|
|
74
|
+
if response.status_code == 200:
|
|
75
|
+
# Successfully connected and authenticated
|
|
76
|
+
return None
|
|
77
|
+
elif response.status_code == 401:
|
|
78
|
+
return "Authentication failed: invalid API token"
|
|
79
|
+
elif response.status_code == 403:
|
|
80
|
+
return "Access forbidden: token lacks required permissions"
|
|
81
|
+
elif response.status_code == 404:
|
|
82
|
+
return "Zephyr Essential API endpoint not found: verify the API URL"
|
|
83
|
+
else:
|
|
84
|
+
return f"Zephyr Essential API returned status code {response.status_code}"
|
|
85
|
+
|
|
86
|
+
except requests.exceptions.SSLError as e:
|
|
87
|
+
return f"SSL certificate verification failed: {str(e)}"
|
|
88
|
+
except requests.exceptions.ConnectionError:
|
|
89
|
+
return f"Cannot connect to Zephyr Essential at {base_url}: connection refused"
|
|
90
|
+
except requests.exceptions.Timeout:
|
|
91
|
+
return f"Connection to Zephyr Essential at {base_url} timed out"
|
|
92
|
+
except requests.exceptions.RequestException as e:
|
|
93
|
+
return f"Error connecting to Zephyr Essential: {str(e)}"
|
|
94
|
+
except Exception as e:
|
|
95
|
+
return f"Unexpected error: {str(e)}"
|
|
@@ -568,7 +568,7 @@ class AlitaClient:
|
|
|
568
568
|
def predict_agent(self, llm: ChatOpenAI, instructions: str = "You are a helpful assistant.",
|
|
569
569
|
tools: Optional[list] = None, chat_history: Optional[List[Any]] = None,
|
|
570
570
|
memory=None, runtime='langchain', variables: Optional[list] = None,
|
|
571
|
-
store: Optional[BaseStore] = None):
|
|
571
|
+
store: Optional[BaseStore] = None, debug_mode: Optional[bool] = False):
|
|
572
572
|
"""
|
|
573
573
|
Create a predict-type agent with minimal configuration.
|
|
574
574
|
|
|
@@ -581,6 +581,7 @@ class AlitaClient:
|
|
|
581
581
|
runtime: Runtime type (default: 'langchain')
|
|
582
582
|
variables: Optional list of variables for the agent
|
|
583
583
|
store: Optional store for memory
|
|
584
|
+
debug_mode: Enable debug mode for cases when assistant can be initialized without tools
|
|
584
585
|
|
|
585
586
|
Returns:
|
|
586
587
|
Runnable agent ready for execution
|
|
@@ -600,7 +601,7 @@ class AlitaClient:
|
|
|
600
601
|
'variables': variables
|
|
601
602
|
}
|
|
602
603
|
return LangChainAssistant(self, agent_data, llm,
|
|
603
|
-
chat_history, "predict", memory=memory, store=store).runnable()
|
|
604
|
+
chat_history, "predict", memory=memory, store=store, debug_mode=debug_mode).runnable()
|
|
604
605
|
|
|
605
606
|
def test_toolkit_tool(self, toolkit_config: dict, tool_name: str, tool_params: dict = None,
|
|
606
607
|
runtime_config: dict = None, llm_model: str = None,
|
|
@@ -143,6 +143,7 @@ class SandboxClient:
|
|
|
143
143
|
self.configurations_url = f'{self.base_url}{self.api_path}/integrations/integrations/default/{self.project_id}?section=configurations&unsecret=true'
|
|
144
144
|
self.ai_section_url = f'{self.base_url}{self.api_path}/integrations/integrations/default/{self.project_id}?section=ai'
|
|
145
145
|
self.image_generation_url = f'{self.base_url}{self.llm_path}/images/generations'
|
|
146
|
+
self.auth_user_url = f'{self.base_url}{self.api_path}/auth/user'
|
|
146
147
|
self.configurations: list = configurations or []
|
|
147
148
|
self.model_timeout = kwargs.get('model_timeout', 120)
|
|
148
149
|
self.model_image_generation = kwargs.get('model_image_generation')
|
|
@@ -363,3 +364,10 @@ class SandboxClient:
|
|
|
363
364
|
url = f'{self.artifact_url}/{bucket_name}'
|
|
364
365
|
data = requests.delete(url, headers=self.headers, verify=False, params={'filename': quote(artifact_name)})
|
|
365
366
|
return self._process_requst(data)
|
|
367
|
+
|
|
368
|
+
def get_user_data(self) -> Dict[str, Any]:
    """
    Fetch the data of the authenticated user from the auth endpoint.

    Returns:
        The decoded JSON body of the response.

    Raises:
        ApiDetailsRequestError: If the endpoint answers with a non-OK status.
    """
    response = requests.get(self.auth_user_url, headers=self.headers, verify=False)
    if not response.ok:
        # Log the full response text, but keep the raised message short.
        logger.error(f'Failed to fetch user data: {response.status_code} - {response.text}')
        raise ApiDetailsRequestError(f'Failed to fetch user data with status code {response.status_code}.')
    return response.json()
|
|
@@ -17,6 +17,7 @@ from .constants import REACT_ADDON, REACT_VARS, XML_ADDON
|
|
|
17
17
|
from .chat_message_template import Jinja2TemplatedChatMessagesTemplate
|
|
18
18
|
from ..tools.echo import EchoTool
|
|
19
19
|
from langchain_core.tools import BaseTool, ToolException
|
|
20
|
+
from jinja2 import Environment, DebugUndefined
|
|
20
21
|
|
|
21
22
|
logger = logging.getLogger(__name__)
|
|
22
23
|
|
|
@@ -29,7 +30,8 @@ class Assistant:
|
|
|
29
30
|
app_type: str = "openai",
|
|
30
31
|
tools: Optional[list] = [],
|
|
31
32
|
memory: Optional[Any] = None,
|
|
32
|
-
store: Optional[BaseStore] = None
|
|
33
|
+
store: Optional[BaseStore] = None,
|
|
34
|
+
debug_mode: Optional[bool] = False):
|
|
33
35
|
|
|
34
36
|
self.app_type = app_type
|
|
35
37
|
self.memory = memory
|
|
@@ -77,11 +79,17 @@ class Assistant:
|
|
|
77
79
|
else:
|
|
78
80
|
# For predict agents, initialize memory store to None since they don't use memory
|
|
79
81
|
self.store = None
|
|
80
|
-
|
|
82
|
+
|
|
81
83
|
# Lazy import to avoid circular dependency
|
|
82
84
|
from ..toolkits.tools import get_tools
|
|
83
|
-
|
|
84
|
-
|
|
85
|
+
version_tools = data['tools']
|
|
86
|
+
# Handle internal tools
|
|
87
|
+
meta = data.get('meta', {})
|
|
88
|
+
if meta.get("internal_tools"):
|
|
89
|
+
for internal_tool_name in meta.get("internal_tools"):
|
|
90
|
+
version_tools.append({"type": "internal_tool", "name": internal_tool_name})
|
|
91
|
+
|
|
92
|
+
self.tools = get_tools(version_tools, alita_client=alita, llm=self.client, memory_store=self.store, debug_mode=debug_mode)
|
|
85
93
|
if tools:
|
|
86
94
|
self.tools += tools
|
|
87
95
|
# Handle prompt setup
|
|
@@ -118,9 +126,11 @@ class Assistant:
|
|
|
118
126
|
if variables:
|
|
119
127
|
self.prompt.partial_variables = variables
|
|
120
128
|
try:
|
|
121
|
-
logger.info(
|
|
129
|
+
logger.info(
|
|
130
|
+
f"Client was created with client setting: temperature - {self.client._get_model_default_parameters}")
|
|
122
131
|
except Exception as e:
|
|
123
|
-
logger.info(
|
|
132
|
+
logger.info(
|
|
133
|
+
f"Client was created with client setting: temperature - {self.client.temperature} : {self.client.max_tokens}")
|
|
124
134
|
|
|
125
135
|
def _configure_store(self, memory_tool: dict | None) -> None:
|
|
126
136
|
"""
|
|
@@ -137,11 +147,9 @@ class Assistant:
|
|
|
137
147
|
def runnable(self):
|
|
138
148
|
if self.app_type == 'pipeline':
|
|
139
149
|
return self.pipeline()
|
|
140
|
-
elif self.app_type == 'openai':
|
|
141
|
-
return self.getOpenAIToolsAgentExecutor()
|
|
142
150
|
elif self.app_type == 'xml':
|
|
143
151
|
return self.getXMLAgentExecutor()
|
|
144
|
-
elif self.app_type in ['predict', 'react']:
|
|
152
|
+
elif self.app_type in ['predict', 'react', 'openai']:
|
|
145
153
|
return self.getLangGraphReactAgent()
|
|
146
154
|
else:
|
|
147
155
|
self.tools = [EchoTool()] + self.tools
|
|
@@ -159,7 +167,6 @@ class Assistant:
|
|
|
159
167
|
agent = create_json_chat_agent(llm=self.client, tools=simple_tools, prompt=self.prompt)
|
|
160
168
|
return self._agent_executor(agent)
|
|
161
169
|
|
|
162
|
-
|
|
163
170
|
def getXMLAgentExecutor(self):
|
|
164
171
|
# Exclude compiled graph runnables from simple tool agents
|
|
165
172
|
simple_tools = [t for t in self.tools if isinstance(t, (BaseTool, CompiledStateGraph))]
|
|
@@ -180,34 +187,6 @@ class Assistant:
|
|
|
180
187
|
# Exclude compiled graph runnables from simple tool agents
|
|
181
188
|
simple_tools = [t for t in self.tools if isinstance(t, (BaseTool, CompiledStateGraph))]
|
|
182
189
|
|
|
183
|
-
# Add sandbox tool by default for react agents
|
|
184
|
-
try:
|
|
185
|
-
from ..tools.sandbox import create_sandbox_tool
|
|
186
|
-
sandbox_tool = create_sandbox_tool(stateful=False, allow_net=True)
|
|
187
|
-
simple_tools.append(sandbox_tool)
|
|
188
|
-
logger.info("Added PyodideSandboxTool to react agent")
|
|
189
|
-
except ImportError as e:
|
|
190
|
-
logger.warning(f"Failed to add PyodideSandboxTool: {e}. Install langchain-sandbox to enable this feature.")
|
|
191
|
-
except RuntimeError as e:
|
|
192
|
-
if "Deno" in str(e):
|
|
193
|
-
logger.warning("Failed to add PyodideSandboxTool: Deno is required. Install from https://docs.deno.com/runtime/getting_started/installation/")
|
|
194
|
-
else:
|
|
195
|
-
logger.warning(f"Failed to add PyodideSandboxTool: {e}")
|
|
196
|
-
except Exception as e:
|
|
197
|
-
logger.error(f"Error adding PyodideSandboxTool: {e}")
|
|
198
|
-
|
|
199
|
-
# Add image generation tool if model is configured
|
|
200
|
-
if self.alita_client.model_image_generation is not None:
|
|
201
|
-
try:
|
|
202
|
-
from ..tools.image_generation import (
|
|
203
|
-
create_image_generation_tool
|
|
204
|
-
)
|
|
205
|
-
image_tool = create_image_generation_tool(self.alita_client)
|
|
206
|
-
simple_tools.append(image_tool)
|
|
207
|
-
logger.info("Added ImageGenerationTool to react agent")
|
|
208
|
-
except Exception as e:
|
|
209
|
-
logger.error(f"Error adding ImageGenerationTool: {e}")
|
|
210
|
-
|
|
211
190
|
# Set up memory/checkpointer if available
|
|
212
191
|
checkpointer = None
|
|
213
192
|
if self.memory is not None:
|
|
@@ -241,6 +220,10 @@ class Assistant:
|
|
|
241
220
|
# Only use prompt_instructions if explicitly specified (for predict app_type)
|
|
242
221
|
if self.app_type == "predict" and isinstance(self.prompt, str):
|
|
243
222
|
prompt_instructions = self.prompt
|
|
223
|
+
|
|
224
|
+
# take the system message from the openai prompt as a prompt instructions
|
|
225
|
+
if self.app_type == "openai" and hasattr(self.prompt, 'messages'):
|
|
226
|
+
prompt_instructions = self.__take_prompt_from_openai_messages()
|
|
244
227
|
|
|
245
228
|
# Create a unified YAML schema with conditional tool binding
|
|
246
229
|
# Build the base node configuration
|
|
@@ -282,6 +265,9 @@ class Assistant:
|
|
|
282
265
|
schema_dict = {
|
|
283
266
|
'name': 'react_agent',
|
|
284
267
|
'state': {
|
|
268
|
+
'input': {
|
|
269
|
+
'type': 'str'
|
|
270
|
+
},
|
|
285
271
|
'messages': state_messages_config
|
|
286
272
|
},
|
|
287
273
|
'nodes': [{
|
|
@@ -290,6 +276,21 @@ class Assistant:
|
|
|
290
276
|
'prompt': {
|
|
291
277
|
'template': escaped_prompt
|
|
292
278
|
},
|
|
279
|
+
'input_mapping': {
|
|
280
|
+
'system': {
|
|
281
|
+
'type': 'fixed',
|
|
282
|
+
'value': escaped_prompt
|
|
283
|
+
},
|
|
284
|
+
'task': {
|
|
285
|
+
'type': 'variable',
|
|
286
|
+
'value': 'input'
|
|
287
|
+
},
|
|
288
|
+
'chat_history': {
|
|
289
|
+
'type': 'variable',
|
|
290
|
+
'value': 'messages'
|
|
291
|
+
}
|
|
292
|
+
},
|
|
293
|
+
'step_limit': self.max_iterations,
|
|
293
294
|
'input': ['messages'],
|
|
294
295
|
'output': ['messages'],
|
|
295
296
|
'transition': 'END'
|
|
@@ -315,7 +316,8 @@ class Assistant:
|
|
|
315
316
|
store=self.store,
|
|
316
317
|
debug=False,
|
|
317
318
|
for_subgraph=False,
|
|
318
|
-
alita_client=self.alita_client
|
|
319
|
+
alita_client=self.alita_client,
|
|
320
|
+
steps_limit=self.max_iterations
|
|
319
321
|
)
|
|
320
322
|
|
|
321
323
|
return agent
|
|
@@ -330,7 +332,8 @@ class Assistant:
|
|
|
330
332
|
agent = create_graph(
|
|
331
333
|
client=self.client, tools=self.tools,
|
|
332
334
|
yaml_schema=self.prompt, memory=memory,
|
|
333
|
-
alita_client=self.alita_client
|
|
335
|
+
alita_client=self.alita_client,
|
|
336
|
+
steps_limit=self.max_iterations
|
|
334
337
|
)
|
|
335
338
|
#
|
|
336
339
|
return agent
|
|
@@ -341,3 +344,16 @@ class Assistant:
|
|
|
341
344
|
|
|
342
345
|
def predict(self, messages: list[BaseMessage]):
|
|
343
346
|
return self.client.invoke(messages)
|
|
347
|
+
|
|
348
|
+
def __take_prompt_from_openai_messages(self):
    """
    Render the first system message of the openai-style prompt.

    Message placeholders (objects exposing ``variable_name``) are skipped.
    The content of the first ``SystemMessage`` found is rendered as a Jinja2
    template with the prompt's partial variables; undefined variables are
    handled by ``DebugUndefined``.

    Returns:
        The rendered system message text, or None when the prompt has no
        messages or no system message.
    """
    if not (self.prompt and self.prompt.messages):
        return None

    for candidate in self.prompt.messages:
        # we don't need any message placeholder from the openai agent prompt,
        # and only the system message carries the instructions
        is_placeholder = hasattr(candidate, 'variable_name')
        if is_placeholder or not isinstance(candidate, SystemMessage):
            continue
        jinja_env = Environment(undefined=DebugUndefined)
        return jinja_env.from_string(candidate.content).render(self.prompt.partial_variables)

    return None
|
|
@@ -80,3 +80,7 @@ DEFAULT_MULTIMODAL_PROMPT = """
|
|
|
80
80
|
- Maintain a structured and logical flow in the output to enhance understanding and usability.
|
|
81
81
|
- Avoid presenting the entire prompt for user.
|
|
82
82
|
"""
|
|
83
|
+
|
|
84
|
+
ELITEA_RS = "elitea_response"
|
|
85
|
+
PRINTER = "printer"
|
|
86
|
+
PRINTER_NODE_RS = "printer_output"
|
|
@@ -1,4 +1,5 @@
|
|
|
1
1
|
import re
|
|
2
|
+
import uuid
|
|
2
3
|
from io import BytesIO
|
|
3
4
|
|
|
4
5
|
import mammoth.images
|
|
@@ -8,6 +9,9 @@ from langchain_core.document_loaders import BaseLoader
|
|
|
8
9
|
from langchain_core.documents import Document
|
|
9
10
|
from mammoth import convert_to_html
|
|
10
11
|
from markdownify import markdownify
|
|
12
|
+
from docx import Document as DocxDocument
|
|
13
|
+
from docx.oxml.ns import qn
|
|
14
|
+
from bs4 import BeautifulSoup
|
|
11
15
|
|
|
12
16
|
from alita_sdk.tools.chunkers.sematic.markdown_chunker import markdown_by_headers_chunker
|
|
13
17
|
from .utils import perform_llm_prediction_for_image_bytes
|
|
@@ -17,6 +21,7 @@ class AlitaDocxMammothLoader(BaseLoader):
|
|
|
17
21
|
"""
|
|
18
22
|
Loader for Docx files using Mammoth to convert to HTML, with image handling,
|
|
19
23
|
and then Markdownify to convert HTML to markdown.
|
|
24
|
+
Detects bordered paragraphs and text boxes and treats them as code blocks.
|
|
20
25
|
"""
|
|
21
26
|
def __init__(self, **kwargs):
|
|
22
27
|
"""
|
|
@@ -97,6 +102,295 @@ class AlitaDocxMammothLoader(BaseLoader):
|
|
|
97
102
|
new_md = pattern.sub(replace_placeholder, original_md)
|
|
98
103
|
return new_md
|
|
99
104
|
|
|
105
|
+
def __has_border(self, paragraph):
    """
    Check if a paragraph has border formatting.

    A side counts as bordered when its ``w:val`` attribute is set to
    anything other than ``none`` or ``nil`` (both mean "no border" in
    WordprocessingML).

    Args:
        paragraph: A python-docx Paragraph object.

    Returns:
        bool: True if paragraph has any border, False otherwise.
    """
    pPr = paragraph._element.pPr
    if pPr is None:
        return False

    pBdr = pPr.find(qn('w:pBdr'))
    if pBdr is None:
        return False

    # Check if any border side exists (top, bottom, left, right)
    for side in ('top', 'bottom', 'left', 'right'):
        border = pBdr.find(qn(f'w:{side}'))
        if border is None:
            continue
        val = border.get(qn('w:val'))
        # 'nil' is the second "no border" value next to 'none'
        if val and val not in ('none', 'nil'):
            return True
    return False
|
|
128
|
+
|
|
129
|
+
def __find_text_boxes(self, doc):
    """
    Find all text boxes in document by searching OOXML structure.
    Text box content lives in w:txbxContent elements.

    Args:
        doc: A python-docx Document object.

    Returns:
        list: List of tuples (element, paragraphs_inside_textbox).
    """
    # Match on exact qualified names. A suffix test such as
    # tag.endswith('p') also matches unrelated elements (w:top, w:wordWrap,
    # wps:wsp, ...), and an unfiltered iter() yields comment nodes whose
    # .tag is not a string.
    txbx_tag = qn('w:txbxContent')
    para_tag = qn('w:p')

    text_boxes = []
    for element in doc.element.body.iter(txbx_tag):
        # Collect all paragraphs inside this text box
        txbx_paragraphs = list(element.iter(para_tag))
        if txbx_paragraphs:
            text_boxes.append((element, txbx_paragraphs))

    return text_boxes
|
|
156
|
+
|
|
157
|
+
def __create_marker_paragraph(self, marker_text):
    """
    Build a minimal OOXML paragraph (w:p > w:r > w:t) holding marker text.

    Args:
        marker_text (str): The marker text to insert.

    Returns:
        Element: An OOXML paragraph element.
    """
    from docx.oxml import OxmlElement

    # Assemble from the innermost node outwards: text -> run -> paragraph.
    text_node = OxmlElement('w:t')
    text_node.text = marker_text

    run_node = OxmlElement('w:r')
    run_node.append(text_node)

    paragraph_node = OxmlElement('w:p')
    paragraph_node.append(run_node)
    return paragraph_node
|
|
176
|
+
|
|
177
|
+
def __inject_markers_for_paragraph(self, paragraph, start_marker, end_marker):
    """
    Surround a single paragraph with a start and an end marker paragraph.

    Args:
        paragraph: A python-docx Paragraph object.
        start_marker (str): The start marker text.
        end_marker (str): The end marker text.
    """
    target = paragraph._element
    # Start marker becomes the previous sibling, end marker the next one.
    target.addprevious(self.__create_marker_paragraph(start_marker))
    target.addnext(self.__create_marker_paragraph(end_marker))
|
|
193
|
+
|
|
194
|
+
def __inject_markers_for_textbox(self, textbox_element, paragraph_elements, start_marker, end_marker):
    """
    Surround the paragraphs of a text box with marker paragraphs.

    Args:
        textbox_element: The w:txbxContent element (not used by the body).
        paragraph_elements: List of paragraph elements inside the text box.
        start_marker (str): The start marker text.
        end_marker (str): The end marker text.
    """
    if paragraph_elements:
        # Start marker goes in front of the first paragraph of the text box ...
        opening = self.__create_marker_paragraph(start_marker)
        paragraph_elements[0].addprevious(opening)

        # ... and the end marker right behind the last one.
        closing = self.__create_marker_paragraph(end_marker)
        paragraph_elements[-1].addnext(closing)
|
|
216
|
+
|
|
217
|
+
def __detect_and_mark_bordered_content(self, docx_stream):
    """
    Mark bordered paragraphs with unique marker paragraphs.

    Bordered paragraphs that follow each other in ``doc.paragraphs`` are
    treated as one block: a start marker is placed before the first and an
    end marker after the last paragraph of the block. Text boxes are not
    marked by this method.

    Args:
        docx_stream: A file-like object containing the DOCX document.

    Returns:
        tuple: (modified_docx_stream, start_marker, end_marker)
    """
    document = DocxDocument(docx_stream)

    # A random suffix keeps the markers from clashing with document content.
    suffix = uuid.uuid4().hex[:8]
    start_marker = f"<<<BORDERED_BLOCK_START_{suffix}>>>"
    end_marker = f"<<<BORDERED_BLOCK_END_{suffix}>>>"

    # Pass 1: collect the blocks. Nothing is modified while scanning.
    blocks = []
    open_block = None
    for paragraph in document.paragraphs:
        if not self.__has_border(paragraph):
            # A non-bordered paragraph closes whatever block was open.
            open_block = None
            continue
        if open_block is None:
            open_block = []
            blocks.append(open_block)
        open_block.append(paragraph)

    # Pass 2: wrap each block in a start/end marker pair.
    for block in blocks:
        block[0]._element.addprevious(self.__create_marker_paragraph(start_marker))
        block[-1]._element.addnext(self.__create_marker_paragraph(end_marker))

    # Serialize the modified document into a fresh, rewound in-memory stream.
    buffer = BytesIO()
    document.save(buffer)
    buffer.seek(0)

    return buffer, start_marker, end_marker
|
|
279
|
+
|
|
280
|
+
def __contains_complex_structure(self, content_html):
    """
    Tell whether an HTML fragment contains a table or a list.

    Args:
        content_html (str): HTML content to analyze.

    Returns:
        bool: True if a <table>, <ul> or <ol> element is present, False otherwise.
    """
    parsed = BeautifulSoup(content_html, 'html.parser')
    # Tables first, then unordered and ordered lists; stop at the first hit.
    return any(parsed.find(tag_name) for tag_name in ('table', 'ul', 'ol'))
|
|
301
|
+
|
|
302
|
+
def __escape_hash_symbols(self, html_content):
    """
    Backslash-escape a leading hash (#) in text elements so it is not
    treated as a markdown header later on.

    Only elements whose content is a single string are touched; the first
    '#' of such a string is escaped when the text starts with '#' after
    leading whitespace.

    Args:
        html_content (str): HTML content.

    Returns:
        str: HTML with escaped hash symbols.
    """
    text_tags = ['p', 'li', 'td', 'th', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6']
    parsed = BeautifulSoup(html_content, 'html.parser')

    for node in parsed.find_all(text_tags):
        text = node.string
        if not text:
            # No single string child (or an empty one): nothing to escape.
            continue
        if text.lstrip().startswith('#'):
            node.string = text.replace('#', '\\#', 1)

    return str(parsed)
|
|
324
|
+
|
|
325
|
+
def __wrap_marked_sections_in_code_blocks(self, html, start_marker, end_marker):
    """
    Find content between markers and wrap appropriately:
    - Simple text/code -> <pre><code> block
    - Tables/lists -> <div class="alita-bordered-content"> with preserved structure

    Args:
        html (str): The HTML content from Mammoth.
        start_marker (str): The start marker text.
        end_marker (str): The end marker text.

    Returns:
        str: HTML with marked sections wrapped appropriately.
    """
    # The 'html' parameter shadows the stdlib module, hence the alias.
    import html as html_module

    # Mammoth escapes < and > to &lt; and &gt;, so we need to escape our markers too
    escaped_start = html_module.escape(start_marker)
    escaped_end = html_module.escape(end_marker)

    # Pattern to find content between HTML-escaped markers (including HTML tags)
    # The markers will be in separate <p> tags, and content in between
    pattern = re.compile(
        f'<p>{re.escape(escaped_start)}</p>(.*?)<p>{re.escape(escaped_end)}</p>',
        re.DOTALL
    )

    def replace_with_appropriate_wrapper(match):
        content = match.group(1)

        if self.__contains_complex_structure(content):
            # Preserve structure: keep HTML as-is, escape # symbols
            escaped_content = self.__escape_hash_symbols(content)
            # Wrap in a div with special class for potential custom handling
            return f'<div class="alita-bordered-content">{escaped_content}</div>'

        # Simple text/code: extract as plain text and wrap in code block
        content_soup = BeautifulSoup(content, 'html.parser')

        # Turn every <br> into a newline in the parsed tree itself, so the
        # paragraph path and the no-paragraph fallback below both keep the
        # line breaks. (Replacing them in the raw string after parsing has
        # no effect on the text extracted from the soup.)
        for br in content_soup.find_all('br'):
            br.replace_with('\n')

        # Extract text from each paragraph separately to preserve line breaks.
        # Preserve leading whitespace (indentation), only strip trailing.
        lines = [
            element.get_text().rstrip()
            for element in content_soup.find_all(['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6'])
        ]

        # If no paragraphs found, just get all text
        if not lines:
            lines = [line.rstrip() for line in content_soup.get_text().split('\n')]

        # Join lines, strip only leading/trailing empty lines
        content_text = '\n'.join(lines).strip()
        # Return as code block (need to HTML-escape the content)
        content_escaped = html_module.escape(content_text)
        return f'<pre><code>{content_escaped}</code></pre>'

    # Replace all marked sections with appropriate wrappers
    return pattern.sub(replace_with_appropriate_wrapper, html)
|
|
393
|
+
|
|
100
394
|
def load(self):
|
|
101
395
|
"""
|
|
102
396
|
Loads and converts the Docx file to markdown format.
|
|
@@ -131,6 +425,7 @@ class AlitaDocxMammothLoader(BaseLoader):
|
|
|
131
425
|
def _convert_docx_to_markdown(self, docx_file):
|
|
132
426
|
"""
|
|
133
427
|
Converts the content of a Docx file to markdown format.
|
|
428
|
+
Detects bordered content and treats it as code blocks.
|
|
134
429
|
|
|
135
430
|
Args:
|
|
136
431
|
docx_file (BinaryIO): The Docx file object.
|
|
@@ -138,11 +433,28 @@ class AlitaDocxMammothLoader(BaseLoader):
|
|
|
138
433
|
Returns:
|
|
139
434
|
str: The markdown content extracted from the Docx file.
|
|
140
435
|
"""
|
|
436
|
+
# Step 1: Detect and mark bordered content
|
|
437
|
+
# Reset stream position if needed
|
|
438
|
+
if hasattr(docx_file, 'seek'):
|
|
439
|
+
docx_file.seek(0)
|
|
440
|
+
|
|
441
|
+
marked_docx, start_marker, end_marker = self.__detect_and_mark_bordered_content(docx_file)
|
|
442
|
+
|
|
443
|
+
# Step 2: Convert marked DOCX to HTML using Mammoth
|
|
141
444
|
if self.extract_images:
|
|
142
445
|
# Extract images using the provided image handler
|
|
143
|
-
result = convert_to_html(
|
|
446
|
+
result = convert_to_html(marked_docx, convert_image=mammoth.images.img_element(self.__handle_image))
|
|
144
447
|
else:
|
|
145
448
|
# Ignore images
|
|
146
|
-
result = convert_to_html(
|
|
147
|
-
|
|
449
|
+
result = convert_to_html(marked_docx, convert_image=lambda image: "")
|
|
450
|
+
|
|
451
|
+
# Step 3: Wrap marked sections in <pre><code> tags
|
|
452
|
+
html_with_code_blocks = self.__wrap_marked_sections_in_code_blocks(
|
|
453
|
+
result.value, start_marker, end_marker
|
|
454
|
+
)
|
|
455
|
+
|
|
456
|
+
# Step 4: Convert HTML to markdown
|
|
457
|
+
content = markdownify(html_with_code_blocks, heading_style="ATX")
|
|
458
|
+
|
|
459
|
+
# Step 5: Post-process markdown (for image transcripts, etc.)
|
|
148
460
|
return self.__postprocess_original_md(content)
|