alita-sdk 0.3.365__py3-none-any.whl → 0.3.462__py3-none-any.whl

This diff compares the contents of two publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions as they appear in their respective public registries.

Potentially problematic release.


This version of alita-sdk might be problematic; more details are available via the link on the original page.

Files changed (118):
  1. alita_sdk/cli/__init__.py +10 -0
  2. alita_sdk/cli/__main__.py +17 -0
  3. alita_sdk/cli/agent_executor.py +144 -0
  4. alita_sdk/cli/agent_loader.py +197 -0
  5. alita_sdk/cli/agent_ui.py +166 -0
  6. alita_sdk/cli/agents.py +1069 -0
  7. alita_sdk/cli/callbacks.py +576 -0
  8. alita_sdk/cli/cli.py +159 -0
  9. alita_sdk/cli/config.py +153 -0
  10. alita_sdk/cli/formatting.py +182 -0
  11. alita_sdk/cli/mcp_loader.py +315 -0
  12. alita_sdk/cli/toolkit.py +330 -0
  13. alita_sdk/cli/toolkit_loader.py +55 -0
  14. alita_sdk/cli/tools/__init__.py +9 -0
  15. alita_sdk/cli/tools/filesystem.py +905 -0
  16. alita_sdk/configurations/bitbucket.py +95 -0
  17. alita_sdk/configurations/confluence.py +96 -1
  18. alita_sdk/configurations/gitlab.py +79 -0
  19. alita_sdk/configurations/jira.py +103 -0
  20. alita_sdk/configurations/testrail.py +88 -0
  21. alita_sdk/configurations/xray.py +93 -0
  22. alita_sdk/configurations/zephyr_enterprise.py +93 -0
  23. alita_sdk/configurations/zephyr_essential.py +75 -0
  24. alita_sdk/runtime/clients/artifact.py +1 -1
  25. alita_sdk/runtime/clients/client.py +47 -10
  26. alita_sdk/runtime/clients/mcp_discovery.py +342 -0
  27. alita_sdk/runtime/clients/mcp_manager.py +262 -0
  28. alita_sdk/runtime/clients/sandbox_client.py +373 -0
  29. alita_sdk/runtime/langchain/assistant.py +70 -41
  30. alita_sdk/runtime/langchain/constants.py +6 -1
  31. alita_sdk/runtime/langchain/document_loaders/AlitaDocxMammothLoader.py +315 -3
  32. alita_sdk/runtime/langchain/document_loaders/AlitaJSONLoader.py +4 -1
  33. alita_sdk/runtime/langchain/document_loaders/constants.py +73 -100
  34. alita_sdk/runtime/langchain/langraph_agent.py +164 -38
  35. alita_sdk/runtime/langchain/utils.py +43 -7
  36. alita_sdk/runtime/models/mcp_models.py +61 -0
  37. alita_sdk/runtime/toolkits/__init__.py +24 -0
  38. alita_sdk/runtime/toolkits/application.py +8 -1
  39. alita_sdk/runtime/toolkits/artifact.py +5 -6
  40. alita_sdk/runtime/toolkits/mcp.py +895 -0
  41. alita_sdk/runtime/toolkits/tools.py +140 -50
  42. alita_sdk/runtime/tools/__init__.py +7 -2
  43. alita_sdk/runtime/tools/application.py +7 -0
  44. alita_sdk/runtime/tools/function.py +94 -5
  45. alita_sdk/runtime/tools/graph.py +10 -4
  46. alita_sdk/runtime/tools/image_generation.py +104 -8
  47. alita_sdk/runtime/tools/llm.py +204 -114
  48. alita_sdk/runtime/tools/mcp_inspect_tool.py +284 -0
  49. alita_sdk/runtime/tools/mcp_remote_tool.py +166 -0
  50. alita_sdk/runtime/tools/mcp_server_tool.py +3 -1
  51. alita_sdk/runtime/tools/sandbox.py +180 -79
  52. alita_sdk/runtime/tools/vectorstore.py +22 -21
  53. alita_sdk/runtime/tools/vectorstore_base.py +79 -26
  54. alita_sdk/runtime/utils/mcp_oauth.py +164 -0
  55. alita_sdk/runtime/utils/mcp_sse_client.py +405 -0
  56. alita_sdk/runtime/utils/streamlit.py +34 -3
  57. alita_sdk/runtime/utils/toolkit_utils.py +14 -4
  58. alita_sdk/runtime/utils/utils.py +1 -0
  59. alita_sdk/tools/__init__.py +48 -31
  60. alita_sdk/tools/ado/repos/__init__.py +1 -0
  61. alita_sdk/tools/ado/test_plan/__init__.py +1 -1
  62. alita_sdk/tools/ado/wiki/__init__.py +1 -5
  63. alita_sdk/tools/ado/work_item/__init__.py +1 -5
  64. alita_sdk/tools/ado/work_item/ado_wrapper.py +17 -8
  65. alita_sdk/tools/base_indexer_toolkit.py +194 -112
  66. alita_sdk/tools/bitbucket/__init__.py +1 -0
  67. alita_sdk/tools/chunkers/sematic/proposal_chunker.py +1 -1
  68. alita_sdk/tools/code/sonar/__init__.py +1 -1
  69. alita_sdk/tools/code_indexer_toolkit.py +15 -5
  70. alita_sdk/tools/confluence/__init__.py +2 -2
  71. alita_sdk/tools/confluence/api_wrapper.py +110 -63
  72. alita_sdk/tools/confluence/loader.py +10 -0
  73. alita_sdk/tools/elitea_base.py +22 -22
  74. alita_sdk/tools/github/__init__.py +2 -2
  75. alita_sdk/tools/gitlab/__init__.py +2 -1
  76. alita_sdk/tools/gitlab/api_wrapper.py +11 -7
  77. alita_sdk/tools/gitlab_org/__init__.py +1 -2
  78. alita_sdk/tools/google_places/__init__.py +2 -1
  79. alita_sdk/tools/jira/__init__.py +1 -0
  80. alita_sdk/tools/jira/api_wrapper.py +1 -1
  81. alita_sdk/tools/memory/__init__.py +1 -1
  82. alita_sdk/tools/non_code_indexer_toolkit.py +2 -2
  83. alita_sdk/tools/openapi/__init__.py +10 -1
  84. alita_sdk/tools/pandas/__init__.py +1 -1
  85. alita_sdk/tools/postman/__init__.py +2 -1
  86. alita_sdk/tools/postman/api_wrapper.py +18 -8
  87. alita_sdk/tools/postman/postman_analysis.py +8 -1
  88. alita_sdk/tools/pptx/__init__.py +2 -2
  89. alita_sdk/tools/qtest/__init__.py +3 -3
  90. alita_sdk/tools/qtest/api_wrapper.py +1708 -76
  91. alita_sdk/tools/rally/__init__.py +1 -2
  92. alita_sdk/tools/report_portal/__init__.py +1 -0
  93. alita_sdk/tools/salesforce/__init__.py +1 -0
  94. alita_sdk/tools/servicenow/__init__.py +2 -3
  95. alita_sdk/tools/sharepoint/__init__.py +1 -0
  96. alita_sdk/tools/sharepoint/api_wrapper.py +125 -34
  97. alita_sdk/tools/sharepoint/authorization_helper.py +191 -1
  98. alita_sdk/tools/sharepoint/utils.py +8 -2
  99. alita_sdk/tools/slack/__init__.py +1 -0
  100. alita_sdk/tools/sql/__init__.py +2 -1
  101. alita_sdk/tools/sql/api_wrapper.py +71 -23
  102. alita_sdk/tools/testio/__init__.py +1 -0
  103. alita_sdk/tools/testrail/__init__.py +1 -3
  104. alita_sdk/tools/utils/__init__.py +17 -0
  105. alita_sdk/tools/utils/content_parser.py +35 -24
  106. alita_sdk/tools/vector_adapters/VectorStoreAdapter.py +67 -21
  107. alita_sdk/tools/xray/__init__.py +2 -1
  108. alita_sdk/tools/zephyr/__init__.py +2 -1
  109. alita_sdk/tools/zephyr_enterprise/__init__.py +1 -0
  110. alita_sdk/tools/zephyr_essential/__init__.py +1 -0
  111. alita_sdk/tools/zephyr_scale/__init__.py +1 -0
  112. alita_sdk/tools/zephyr_squad/__init__.py +1 -0
  113. {alita_sdk-0.3.365.dist-info → alita_sdk-0.3.462.dist-info}/METADATA +8 -2
  114. {alita_sdk-0.3.365.dist-info → alita_sdk-0.3.462.dist-info}/RECORD +118 -93
  115. alita_sdk-0.3.462.dist-info/entry_points.txt +2 -0
  116. {alita_sdk-0.3.365.dist-info → alita_sdk-0.3.462.dist-info}/WHEEL +0 -0
  117. {alita_sdk-0.3.365.dist-info → alita_sdk-0.3.462.dist-info}/licenses/LICENSE +0 -0
  118. {alita_sdk-0.3.365.dist-info → alita_sdk-0.3.462.dist-info}/top_level.txt +0 -0
@@ -17,6 +17,7 @@ from .constants import REACT_ADDON, REACT_VARS, XML_ADDON
17
17
  from .chat_message_template import Jinja2TemplatedChatMessagesTemplate
18
18
  from ..tools.echo import EchoTool
19
19
  from langchain_core.tools import BaseTool, ToolException
20
+ from jinja2 import Environment, DebugUndefined
20
21
 
21
22
  logger = logging.getLogger(__name__)
22
23
 
@@ -29,7 +30,9 @@ class Assistant:
29
30
  app_type: str = "openai",
30
31
  tools: Optional[list] = [],
31
32
  memory: Optional[Any] = None,
32
- store: Optional[BaseStore] = None):
33
+ store: Optional[BaseStore] = None,
34
+ debug_mode: Optional[bool] = False,
35
+ mcp_tokens: Optional[dict] = None):
33
36
 
34
37
  self.app_type = app_type
35
38
  self.memory = memory
@@ -77,11 +80,24 @@ class Assistant:
77
80
  else:
78
81
  # For predict agents, initialize memory store to None since they don't use memory
79
82
  self.store = None
80
-
83
+
81
84
  # Lazy import to avoid circular dependency
82
85
  from ..toolkits.tools import get_tools
83
-
84
- self.tools = get_tools(data['tools'], alita_client=alita, llm=self.client, memory_store=self.store)
86
+ version_tools = data['tools']
87
+ # Handle internal tools
88
+ meta = data.get('meta', {})
89
+ if meta.get("internal_tools"):
90
+ for internal_tool_name in meta.get("internal_tools"):
91
+ version_tools.append({"type": "internal_tool", "name": internal_tool_name})
92
+
93
+ self.tools = get_tools(
94
+ version_tools,
95
+ alita_client=alita,
96
+ llm=self.client,
97
+ memory_store=self.store,
98
+ debug_mode=debug_mode,
99
+ mcp_tokens=mcp_tokens
100
+ )
85
101
  if tools:
86
102
  self.tools += tools
87
103
  # Handle prompt setup
@@ -111,13 +127,18 @@ class Assistant:
111
127
  messages.extend(chat_history)
112
128
  self.prompt = Jinja2TemplatedChatMessagesTemplate(messages=messages)
113
129
  if input_variables:
114
- self.prompt.input_variables = input_variables
130
+ if hasattr(self.prompt, 'input_variables') and self.prompt.input_variables is not None:
131
+ self.prompt.input_variables.extend(input_variables)
132
+ else:
133
+ self.prompt.input_variables = input_variables
115
134
  if variables:
116
135
  self.prompt.partial_variables = variables
117
136
  try:
118
- logger.info(f"Client was created with client setting: temperature - {self.client._get_model_default_parameters}")
137
+ logger.info(
138
+ f"Client was created with client setting: temperature - {self.client._get_model_default_parameters}")
119
139
  except Exception as e:
120
- logger.info(f"Client was created with client setting: temperature - {self.client.temperature} : {self.client.max_tokens}")
140
+ logger.info(
141
+ f"Client was created with client setting: temperature - {self.client.temperature} : {self.client.max_tokens}")
121
142
 
122
143
  def _configure_store(self, memory_tool: dict | None) -> None:
123
144
  """
@@ -134,11 +155,9 @@ class Assistant:
134
155
  def runnable(self):
135
156
  if self.app_type == 'pipeline':
136
157
  return self.pipeline()
137
- elif self.app_type == 'openai':
138
- return self.getOpenAIToolsAgentExecutor()
139
158
  elif self.app_type == 'xml':
140
159
  return self.getXMLAgentExecutor()
141
- elif self.app_type in ['predict', 'react']:
160
+ elif self.app_type in ['predict', 'react', 'openai']:
142
161
  return self.getLangGraphReactAgent()
143
162
  else:
144
163
  self.tools = [EchoTool()] + self.tools
@@ -156,7 +175,6 @@ class Assistant:
156
175
  agent = create_json_chat_agent(llm=self.client, tools=simple_tools, prompt=self.prompt)
157
176
  return self._agent_executor(agent)
158
177
 
159
-
160
178
  def getXMLAgentExecutor(self):
161
179
  # Exclude compiled graph runnables from simple tool agents
162
180
  simple_tools = [t for t in self.tools if isinstance(t, (BaseTool, CompiledStateGraph))]
@@ -177,34 +195,6 @@ class Assistant:
177
195
  # Exclude compiled graph runnables from simple tool agents
178
196
  simple_tools = [t for t in self.tools if isinstance(t, (BaseTool, CompiledStateGraph))]
179
197
 
180
- # Add sandbox tool by default for react agents
181
- try:
182
- from ..tools.sandbox import create_sandbox_tool
183
- sandbox_tool = create_sandbox_tool(stateful=False, allow_net=True)
184
- simple_tools.append(sandbox_tool)
185
- logger.info("Added PyodideSandboxTool to react agent")
186
- except ImportError as e:
187
- logger.warning(f"Failed to add PyodideSandboxTool: {e}. Install langchain-sandbox to enable this feature.")
188
- except RuntimeError as e:
189
- if "Deno" in str(e):
190
- logger.warning("Failed to add PyodideSandboxTool: Deno is required. Install from https://docs.deno.com/runtime/getting_started/installation/")
191
- else:
192
- logger.warning(f"Failed to add PyodideSandboxTool: {e}")
193
- except Exception as e:
194
- logger.error(f"Error adding PyodideSandboxTool: {e}")
195
-
196
- # Add image generation tool if model is configured
197
- if self.alita_client.model_image_generation is not None:
198
- try:
199
- from ..tools.image_generation import (
200
- create_image_generation_tool
201
- )
202
- image_tool = create_image_generation_tool(self.alita_client)
203
- simple_tools.append(image_tool)
204
- logger.info("Added ImageGenerationTool to react agent")
205
- except Exception as e:
206
- logger.error(f"Error adding ImageGenerationTool: {e}")
207
-
208
198
  # Set up memory/checkpointer if available
209
199
  checkpointer = None
210
200
  if self.memory is not None:
@@ -238,6 +228,10 @@ class Assistant:
238
228
  # Only use prompt_instructions if explicitly specified (for predict app_type)
239
229
  if self.app_type == "predict" and isinstance(self.prompt, str):
240
230
  prompt_instructions = self.prompt
231
+
232
+ # take the system message from the openai prompt as a prompt instructions
233
+ if self.app_type == "openai" and hasattr(self.prompt, 'messages'):
234
+ prompt_instructions = self.__take_prompt_from_openai_messages()
241
235
 
242
236
  # Create a unified YAML schema with conditional tool binding
243
237
  # Build the base node configuration
@@ -279,6 +273,9 @@ class Assistant:
279
273
  schema_dict = {
280
274
  'name': 'react_agent',
281
275
  'state': {
276
+ 'input': {
277
+ 'type': 'str'
278
+ },
282
279
  'messages': state_messages_config
283
280
  },
284
281
  'nodes': [{
@@ -287,6 +284,21 @@ class Assistant:
287
284
  'prompt': {
288
285
  'template': escaped_prompt
289
286
  },
287
+ 'input_mapping': {
288
+ 'system': {
289
+ 'type': 'fixed',
290
+ 'value': escaped_prompt
291
+ },
292
+ 'task': {
293
+ 'type': 'variable',
294
+ 'value': 'input'
295
+ },
296
+ 'chat_history': {
297
+ 'type': 'variable',
298
+ 'value': 'messages'
299
+ }
300
+ },
301
+ 'step_limit': self.max_iterations,
290
302
  'input': ['messages'],
291
303
  'output': ['messages'],
292
304
  'transition': 'END'
@@ -311,7 +323,9 @@ class Assistant:
311
323
  memory=checkpointer,
312
324
  store=self.store,
313
325
  debug=False,
314
- for_subgraph=False
326
+ for_subgraph=False,
327
+ alita_client=self.alita_client,
328
+ steps_limit=self.max_iterations
315
329
  )
316
330
 
317
331
  return agent
@@ -325,7 +339,9 @@ class Assistant:
325
339
  #
326
340
  agent = create_graph(
327
341
  client=self.client, tools=self.tools,
328
- yaml_schema=self.prompt, memory=memory
342
+ yaml_schema=self.prompt, memory=memory,
343
+ alita_client=self.alita_client,
344
+ steps_limit=self.max_iterations
329
345
  )
330
346
  #
331
347
  return agent
@@ -336,3 +352,16 @@ class Assistant:
336
352
 
337
353
  def predict(self, messages: list[BaseMessage]):
338
354
  return self.client.invoke(messages)
355
+
356
+ def __take_prompt_from_openai_messages(self):
357
+ if self.prompt and self.prompt.messages:
358
+ for message in self.prompt.messages:
359
+ # we don't need any message placeholder from the openai agent prompt
360
+ if hasattr(message, 'variable_name'):
361
+ continue
362
+ # take only the content of the system message from the openai prompt
363
+ if isinstance(message, SystemMessage):
364
+ environment = Environment(undefined=DebugUndefined)
365
+ template = environment.from_string(message.content)
366
+ return template.render(self.prompt.partial_variables)
367
+ return None
@@ -27,7 +27,7 @@ Use this if you want to respond directly to the human. Markdown code snippet for
27
27
  ```json
28
28
  {
29
29
  "action": "Final Answer",
30
- "action_input": string \ You should put what you want to return to use here
30
+ "action_input": string // You should put what you want to return to use here
31
31
  }
32
32
  ```
33
33
 
@@ -80,3 +80,8 @@ DEFAULT_MULTIMODAL_PROMPT = """
80
80
  - Maintain a structured and logical flow in the output to enhance understanding and usability.
81
81
  - Avoid presenting the entire prompt for user.
82
82
  """
83
+
84
+ ELITEA_RS = "elitea_response"
85
+ PRINTER = "printer"
86
+ PRINTER_NODE_RS = "printer_output"
87
+ PRINTER_COMPLETED_STATE = "PRINTER_COMPLETED"
@@ -1,4 +1,5 @@
1
1
  import re
2
+ import uuid
2
3
  from io import BytesIO
3
4
 
4
5
  import mammoth.images
@@ -8,6 +9,9 @@ from langchain_core.document_loaders import BaseLoader
8
9
  from langchain_core.documents import Document
9
10
  from mammoth import convert_to_html
10
11
  from markdownify import markdownify
12
+ from docx import Document as DocxDocument
13
+ from docx.oxml.ns import qn
14
+ from bs4 import BeautifulSoup
11
15
 
12
16
  from alita_sdk.tools.chunkers.sematic.markdown_chunker import markdown_by_headers_chunker
13
17
  from .utils import perform_llm_prediction_for_image_bytes
@@ -17,6 +21,7 @@ class AlitaDocxMammothLoader(BaseLoader):
17
21
  """
18
22
  Loader for Docx files using Mammoth to convert to HTML, with image handling,
19
23
  and then Markdownify to convert HTML to markdown.
24
+ Detects bordered paragraphs and text boxes and treats them as code blocks.
20
25
  """
21
26
  def __init__(self, **kwargs):
22
27
  """
@@ -97,6 +102,295 @@ class AlitaDocxMammothLoader(BaseLoader):
97
102
  new_md = pattern.sub(replace_placeholder, original_md)
98
103
  return new_md
99
104
 
105
+ def __has_border(self, paragraph):
106
+ """
107
+ Check if a paragraph has border formatting.
108
+
109
+ Args:
110
+ paragraph: A python-docx Paragraph object.
111
+
112
+ Returns:
113
+ bool: True if paragraph has any border, False otherwise.
114
+ """
115
+ pPr = paragraph._element.pPr
116
+ if pPr is not None:
117
+ pBdr = pPr.find(qn('w:pBdr'))
118
+ if pBdr is not None:
119
+ # Check if any border side exists (top, bottom, left, right)
120
+ for side in ['top', 'bottom', 'left', 'right']:
121
+ border = pBdr.find(qn(f'w:{side}'))
122
+ if border is not None:
123
+ # Check if border is not "none" or has a width
124
+ val = border.get(qn('w:val'))
125
+ if val and val != 'none':
126
+ return True
127
+ return False
128
+
129
+ def __find_text_boxes(self, doc):
130
+ """
131
+ Find all text boxes in document by searching OOXML structure.
132
+ Text boxes are typically in w:txbxContent elements.
133
+
134
+ Args:
135
+ doc: A python-docx Document object.
136
+
137
+ Returns:
138
+ list: List of tuples (element, paragraphs_inside_textbox).
139
+ """
140
+ text_boxes = []
141
+
142
+ # Iterate through document body XML to find text box content elements
143
+ for element in doc.element.body.iter():
144
+ # Look for text box content elements
145
+ if element.tag.endswith('txbxContent'):
146
+ # Collect all paragraphs inside this text box
147
+ txbx_paragraphs = []
148
+ for txbx_para_element in element.iter():
149
+ if txbx_para_element.tag.endswith('p'):
150
+ txbx_paragraphs.append(txbx_para_element)
151
+
152
+ if txbx_paragraphs:
153
+ text_boxes.append((element, txbx_paragraphs))
154
+
155
+ return text_boxes
156
+
157
+ def __create_marker_paragraph(self, marker_text):
158
+ """
159
+ Create a paragraph element with marker text.
160
+
161
+ Args:
162
+ marker_text (str): The marker text to insert.
163
+
164
+ Returns:
165
+ Element: An OOXML paragraph element.
166
+ """
167
+ from docx.oxml import OxmlElement
168
+
169
+ p = OxmlElement('w:p')
170
+ r = OxmlElement('w:r')
171
+ t = OxmlElement('w:t')
172
+ t.text = marker_text
173
+ r.append(t)
174
+ p.append(r)
175
+ return p
176
+
177
+ def __inject_markers_for_paragraph(self, paragraph, start_marker, end_marker):
178
+ """
179
+ Inject marker paragraphs before and after a bordered paragraph.
180
+
181
+ Args:
182
+ paragraph: A python-docx Paragraph object.
183
+ start_marker (str): The start marker text.
184
+ end_marker (str): The end marker text.
185
+ """
186
+ # Insert start marker paragraph before
187
+ marker_p_start = self.__create_marker_paragraph(start_marker)
188
+ paragraph._element.addprevious(marker_p_start)
189
+
190
+ # Insert end marker paragraph after
191
+ marker_p_end = self.__create_marker_paragraph(end_marker)
192
+ paragraph._element.addnext(marker_p_end)
193
+
194
+ def __inject_markers_for_textbox(self, textbox_element, paragraph_elements, start_marker, end_marker):
195
+ """
196
+ Inject markers around text box content.
197
+
198
+ Args:
199
+ textbox_element: The w:txbxContent element.
200
+ paragraph_elements: List of paragraph elements inside the text box.
201
+ start_marker (str): The start marker text.
202
+ end_marker (str): The end marker text.
203
+ """
204
+ if not paragraph_elements:
205
+ return
206
+
207
+ # Insert start marker before first paragraph in text box
208
+ first_para = paragraph_elements[0]
209
+ marker_p_start = self.__create_marker_paragraph(start_marker)
210
+ first_para.addprevious(marker_p_start)
211
+
212
+ # Insert end marker after last paragraph in text box
213
+ last_para = paragraph_elements[-1]
214
+ marker_p_end = self.__create_marker_paragraph(end_marker)
215
+ last_para.addnext(marker_p_end)
216
+
217
+ def __detect_and_mark_bordered_content(self, docx_stream):
218
+ """
219
+ Detects bordered paragraphs and text boxes, injects unique markers around them.
220
+ Groups consecutive bordered paragraphs into single code blocks.
221
+
222
+ Args:
223
+ docx_stream: A file-like object containing the DOCX document.
224
+
225
+ Returns:
226
+ tuple: (modified_docx_stream, start_marker, end_marker)
227
+ """
228
+ # Load document with python-docx
229
+ doc = DocxDocument(docx_stream)
230
+
231
+ # Generate unique markers to avoid conflicts with document content
232
+ unique_id = uuid.uuid4().hex[:8]
233
+ start_marker = f"<<<BORDERED_BLOCK_START_{unique_id}>>>"
234
+ end_marker = f"<<<BORDERED_BLOCK_END_{unique_id}>>>"
235
+
236
+ # Group consecutive bordered paragraphs together
237
+ bordered_groups = []
238
+ current_group = []
239
+
240
+ for para in doc.paragraphs:
241
+ if self.__has_border(para):
242
+ current_group.append(para)
243
+ else:
244
+ if current_group:
245
+ # End of a bordered group
246
+ bordered_groups.append(current_group)
247
+ current_group = []
248
+
249
+ # Don't forget the last group if document ends with bordered paragraphs
250
+ if current_group:
251
+ bordered_groups.append(current_group)
252
+
253
+ # Collect all text boxes
254
+ # text_boxes = self.__find_text_boxes(doc)
255
+
256
+ # Inject markers around each group of consecutive bordered paragraphs
257
+ for group in bordered_groups:
258
+ if group:
259
+ # Add start marker before first paragraph in group
260
+ first_para = group[0]
261
+ marker_p_start = self.__create_marker_paragraph(start_marker)
262
+ first_para._element.addprevious(marker_p_start)
263
+
264
+ # Add end marker after last paragraph in group
265
+ last_para = group[-1]
266
+ marker_p_end = self.__create_marker_paragraph(end_marker)
267
+ last_para._element.addnext(marker_p_end)
268
+
269
+ # Inject markers around text box content
270
+ # for textbox_element, para_elements in text_boxes:
271
+ # self.__inject_markers_for_textbox(textbox_element, para_elements, start_marker, end_marker)
272
+
273
+ # Save modified document to BytesIO
274
+ output = BytesIO()
275
+ doc.save(output)
276
+ output.seek(0)
277
+
278
+ return output, start_marker, end_marker
279
+
280
+ def __contains_complex_structure(self, content_html):
281
+ """
282
+ Check if HTML content contains tables, lists, or other complex structures.
283
+
284
+ Args:
285
+ content_html (str): HTML content to analyze.
286
+
287
+ Returns:
288
+ bool: True if content contains tables/lists, False otherwise.
289
+ """
290
+ content_soup = BeautifulSoup(content_html, 'html.parser')
291
+
292
+ # Check for tables
293
+ if content_soup.find('table'):
294
+ return True
295
+
296
+ # Check for lists (ul, ol)
297
+ if content_soup.find('ul') or content_soup.find('ol'):
298
+ return True
299
+
300
+ return False
301
+
302
+ def __escape_hash_symbols(self, html_content):
303
+ """
304
+ Escape hash (#) symbols at the beginning of lines in HTML to prevent
305
+ them from being treated as markdown headers.
306
+
307
+ Args:
308
+ html_content (str): HTML content.
309
+
310
+ Returns:
311
+ str: HTML with escaped hash symbols.
312
+ """
313
+ soup = BeautifulSoup(html_content, 'html.parser')
314
+
315
+ # Process all text-containing elements
316
+ for element in soup.find_all(['p', 'li', 'td', 'th', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6']):
317
+ if element.string:
318
+ text = element.string
319
+ # If line starts with #, escape it
320
+ if text.strip().startswith('#'):
321
+ element.string = text.replace('#', '\\#', 1)
322
+
323
+ return str(soup)
324
+
325
+ def __wrap_marked_sections_in_code_blocks(self, html, start_marker, end_marker):
326
+ """
327
+ Find content between markers and wrap appropriately:
328
+ - Simple text/code → <pre><code> block
329
+ - Tables/lists → Custom wrapper with preserved structure
330
+
331
+ Args:
332
+ html (str): The HTML content from Mammoth.
333
+ start_marker (str): The start marker text.
334
+ end_marker (str): The end marker text.
335
+
336
+ Returns:
337
+ str: HTML with marked sections wrapped appropriately.
338
+ """
339
+ import html as html_module
340
+
341
+ # Mammoth escapes < and > to &lt; and &gt;, so we need to escape our markers too
342
+ escaped_start = html_module.escape(start_marker)
343
+ escaped_end = html_module.escape(end_marker)
344
+
345
+ # Pattern to find content between HTML-escaped markers (including HTML tags)
346
+ # The markers will be in separate <p> tags, and content in between
347
+ pattern = re.compile(
348
+ f'<p>{re.escape(escaped_start)}</p>(.*?)<p>{re.escape(escaped_end)}</p>',
349
+ re.DOTALL
350
+ )
351
+
352
+ def replace_with_appropriate_wrapper(match):
353
+ content = match.group(1)
354
+
355
+ # Detect if content has complex structure (tables, lists)
356
+ has_complex_structure = self.__contains_complex_structure(content)
357
+
358
+ if has_complex_structure:
359
+ # Preserve structure: keep HTML as-is, escape # symbols
360
+ escaped_content = self.__escape_hash_symbols(content)
361
+ # Wrap in a div with special class for potential custom handling
362
+ return f'<div class="alita-bordered-content">{escaped_content}</div>'
363
+ else:
364
+ # Simple text/code: extract as plain text and wrap in code block
365
+ content_soup = BeautifulSoup(content, 'html.parser')
366
+
367
+ # Extract text from each paragraph separately to preserve line breaks
368
+ lines = []
369
+ for element in content_soup.find_all(['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6']):
370
+ # Replace <br /> within paragraphs with newlines
371
+ for br in element.find_all('br'):
372
+ br.replace_with('\n')
373
+ text = element.get_text()
374
+ # Preserve leading whitespace (indentation), only strip trailing
375
+ lines.append(text.rstrip())
376
+
377
+ # If no paragraphs found, just get all text
378
+ if not lines:
379
+ content = content.replace('<br />', '\n').replace('<br/>', '\n').replace('<br>', '\n')
380
+ content_text = content_soup.get_text()
381
+ lines = [line.rstrip() for line in content_text.split('\n')]
382
+
383
+ # Join lines, strip only leading/trailing empty lines
384
+ content_text = '\n'.join(lines).strip()
385
+ # Return as code block (need to HTML-escape the content)
386
+ content_escaped = html_module.escape(content_text)
387
+ return f'<pre><code>{content_escaped}</code></pre>'
388
+
389
+ # Replace all marked sections with appropriate wrappers
390
+ result_html = pattern.sub(replace_with_appropriate_wrapper, html)
391
+
392
+ return result_html
393
+
100
394
  def load(self):
101
395
  """
102
396
  Loads and converts the Docx file to markdown format.
@@ -131,6 +425,7 @@ class AlitaDocxMammothLoader(BaseLoader):
131
425
  def _convert_docx_to_markdown(self, docx_file):
132
426
  """
133
427
  Converts the content of a Docx file to markdown format.
428
+ Detects bordered content and treats it as code blocks.
134
429
 
135
430
  Args:
136
431
  docx_file (BinaryIO): The Docx file object.
@@ -138,11 +433,28 @@ class AlitaDocxMammothLoader(BaseLoader):
138
433
  Returns:
139
434
  str: The markdown content extracted from the Docx file.
140
435
  """
436
+ # Step 1: Detect and mark bordered content
437
+ # Reset stream position if needed
438
+ if hasattr(docx_file, 'seek'):
439
+ docx_file.seek(0)
440
+
441
+ marked_docx, start_marker, end_marker = self.__detect_and_mark_bordered_content(docx_file)
442
+
443
+ # Step 2: Convert marked DOCX to HTML using Mammoth
141
444
  if self.extract_images:
142
445
  # Extract images using the provided image handler
143
- result = convert_to_html(docx_file, convert_image=mammoth.images.img_element(self.__handle_image))
446
+ result = convert_to_html(marked_docx, convert_image=mammoth.images.img_element(self.__handle_image))
144
447
  else:
145
448
  # Ignore images
146
- result = convert_to_html(docx_file, convert_image=lambda image: "")
147
- content = markdownify(result.value, heading_style="ATX")
449
+ result = convert_to_html(marked_docx, convert_image=lambda image: "")
450
+
451
+ # Step 3: Wrap marked sections in <pre><code> tags
452
+ html_with_code_blocks = self.__wrap_marked_sections_in_code_blocks(
453
+ result.value, start_marker, end_marker
454
+ )
455
+
456
+ # Step 4: Convert HTML to markdown
457
+ content = markdownify(html_with_code_blocks, heading_style="ATX")
458
+
459
+ # Step 5: Post-process markdown (for image transcripts, etc.)
148
460
  return self.__postprocess_original_md(content)
@@ -30,7 +30,10 @@ class AlitaJSONLoader(BaseLoader):
30
30
  with open(self.file_path, encoding=self.encoding) as f:
31
31
  return json.load(f)
32
32
  elif hasattr(self, 'file_content') and self.file_content:
33
- return json.load(self.file_content)
33
+ if isinstance(self.file_content, bytes):
34
+ return json.loads(self.file_content.decode(self.encoding))
35
+ else:
36
+ return json.load(self.file_content)
34
37
  else:
35
38
  raise ValueError("Neither file_path nor file_content is provided.")
36
39