camel-ai 0.2.34__py3-none-any.whl → 0.2.36__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of camel-ai might be problematic.

Files changed (47)
  1. camel/__init__.py +1 -1
  2. camel/agents/_types.py +1 -1
  3. camel/agents/_utils.py +4 -4
  4. camel/agents/chat_agent.py +174 -29
  5. camel/configs/__init__.py +3 -0
  6. camel/configs/openai_config.py +20 -16
  7. camel/configs/openrouter_config.py +106 -0
  8. camel/datasets/base_generator.py +188 -27
  9. camel/datasets/few_shot_generator.py +2 -5
  10. camel/environments/single_step.py +1 -7
  11. camel/memories/agent_memories.py +49 -2
  12. camel/memories/base.py +23 -1
  13. camel/memories/blocks/chat_history_block.py +2 -1
  14. camel/memories/records.py +5 -0
  15. camel/models/__init__.py +2 -0
  16. camel/models/gemini_model.py +36 -0
  17. camel/models/groq_model.py +6 -3
  18. camel/models/model_factory.py +3 -0
  19. camel/models/openrouter_model.py +204 -0
  20. camel/models/stub_model.py +25 -0
  21. camel/retrievers/vector_retriever.py +12 -7
  22. camel/storages/__init__.py +2 -0
  23. camel/storages/key_value_storages/__init__.py +4 -1
  24. camel/storages/key_value_storages/json.py +3 -7
  25. camel/storages/key_value_storages/mem0_cloud.py +224 -0
  26. camel/storages/vectordb_storages/base.py +5 -1
  27. camel/storages/vectordb_storages/qdrant.py +3 -3
  28. camel/toolkits/__init__.py +2 -1
  29. camel/toolkits/browser_toolkit.py +43 -0
  30. camel/toolkits/code_execution.py +2 -1
  31. camel/toolkits/mcp_toolkit.py +30 -1
  32. camel/toolkits/memory_toolkit.py +129 -0
  33. camel/types/enums.py +24 -0
  34. camel/types/unified_model_type.py +5 -0
  35. camel/utils/chunker/__init__.py +22 -0
  36. camel/utils/chunker/base.py +24 -0
  37. camel/utils/chunker/code_chunker.py +193 -0
  38. camel/utils/chunker/uio_chunker.py +66 -0
  39. camel/utils/token_counting.py +133 -0
  40. camel/verifiers/__init__.py +1 -2
  41. camel/verifiers/base.py +133 -96
  42. camel/verifiers/models.py +0 -12
  43. camel/verifiers/python_verifier.py +25 -14
  44. {camel_ai-0.2.34.dist-info → camel_ai-0.2.36.dist-info}/METADATA +3 -1
  45. {camel_ai-0.2.34.dist-info → camel_ai-0.2.36.dist-info}/RECORD +47 -39
  46. {camel_ai-0.2.34.dist-info → camel_ai-0.2.36.dist-info}/WHEEL +0 -0
  47. {camel_ai-0.2.34.dist-info → camel_ai-0.2.36.dist-info}/licenses/LICENSE +0 -0
camel/toolkits/browser_toolkit.py CHANGED
@@ -438,6 +438,7 @@ class BaseBrowser:
             sync_playwright,
         )
 
+        self._ensure_browser_installed()
         self.history: list = []
         self.headless = headless
         self.playwright = sync_playwright().start()
@@ -914,6 +915,48 @@ class BaseBrowser:
         markdown_content = html2text(html_content)
         return markdown_content
 
+    def _ensure_browser_installed(self) -> None:
+        r"""Ensure the browser is installed."""
+        import platform
+        import subprocess
+        import sys
+
+        try:
+            from playwright.sync_api import sync_playwright
+
+            with sync_playwright() as p:
+                browser = p.chromium.launch()
+                browser.close()
+        except Exception:
+            logger.info("Installing Chromium browser...")
+            try:
+                subprocess.run(
+                    [
+                        sys.executable,
+                        "-m",
+                        "playwright",
+                        "install",
+                        "chromium",
+                    ],
+                    check=True,
+                    capture_output=True,
+                )
+                if platform.system().lower() == "linux":
+                    subprocess.run(
+                        [
+                            sys.executable,
+                            "-m",
+                            "playwright",
+                            "install-deps",
+                            "chromium",
+                        ],
+                        check=True,
+                        capture_output=True,
+                    )
+                logger.info("Chromium browser installation completed")
+            except subprocess.CalledProcessError as e:
+                raise RuntimeError(f"Failed to install browser: {e.stderr}")
+
 
 class BrowserToolkit(BaseToolkit):
     r"""A class for browsing the web and interacting with web pages.
camel/toolkits/code_execution.py CHANGED
@@ -29,6 +29,7 @@ class CodeExecutionToolkit(BaseToolkit):
 
     Args:
         sandbox (str): The environment type used to execute code.
+            (default: `subprocess`)
         verbose (bool): Whether to print the output of the code execution.
             (default: :obj:`False`)
         unsafe_mode (bool): If `True`, the interpreter runs the code
@@ -43,7 +44,7 @@ class CodeExecutionToolkit(BaseToolkit):
         self,
         sandbox: Literal[
             "internal_python", "jupyter", "docker", "subprocess", "e2b"
-        ] = "internal_python",
+        ] = "subprocess",
         verbose: bool = False,
         unsafe_mode: bool = False,
         import_white_list: Optional[List[str]] = None,
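
The sandbox default flips from `internal_python` to `subprocess`, so code now runs in a child process unless stated otherwise. A minimal sketch of both behaviors (constructor arguments taken from the diff; the rest is assumed from the toolkit's existing interface):

```python
from camel.toolkits import CodeExecutionToolkit

toolkit = CodeExecutionToolkit(verbose=True)  # sandbox="subprocess" by default
legacy = CodeExecutionToolkit(sandbox="internal_python")  # pre-0.2.36 default
tools = toolkit.get_tools()  # expose code execution as FunctionTools
```
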
camel/toolkits/mcp_toolkit.py CHANGED
@@ -56,6 +56,8 @@ class _MCPServer(BaseToolkit):
         env (Dict[str, str]): Environment variables for the stdio mode command.
             (default: :obj:`'None'`)
         timeout (Optional[float]): Connection timeout. (default: :obj:`'None'`)
+        headers (Dict[str, str]): Headers for the HTTP request.
+            (default: :obj:`'None'`)
     """
 
     def __init__(
@@ -64,6 +66,7 @@ class _MCPServer(BaseToolkit):
         args: Optional[List[str]] = None,
         env: Optional[Dict[str, str]] = None,
         timeout: Optional[float] = None,
+        headers: Optional[Dict[str, str]] = None,
     ):
         from mcp import Tool
         from mcp.client.session import ClientSession
@@ -73,6 +76,7 @@ class _MCPServer(BaseToolkit):
         self.command_or_url = command_or_url
         self.args = args or []
         self.env = env or {}
+        self.headers = headers or {}
 
         self._mcp_tools: List[Tool] = []
         self._session: Optional['ClientSession'] = None
@@ -99,7 +103,10 @@ class _MCPServer(BaseToolkit):
                 read_stream,
                 write_stream,
             ) = await self._exit_stack.enter_async_context(
-                sse_client(self.command_or_url)
+                sse_client(
+                    self.command_or_url,
+                    headers=self.headers,
+                )
             )
         else:
             command = self.command_or_url
@@ -343,6 +350,27 @@ class MCPToolkit(BaseToolkit):
     Either `servers` or `config_path` must be provided. If both are
     provided, servers from both sources will be combined.
 
+    For web servers in the config file, you can specify authorization
+    headers using the "headers" field to connect to protected MCP server
+    endpoints.
+
+    Example configuration:
+
+    .. code-block:: json
+
+        {
+          "mcpWebServers": {
+            "protected-server": {
+              "url": "https://example.com/mcp",
+              "timeout": 30,
+              "headers": {
+                "Authorization": "Bearer YOUR_TOKEN",
+                "X-API-Key": "YOUR_API_KEY"
+              }
+            }
+          }
+        }
+
     Attributes:
         servers (List[_MCPServer]): List of _MCPServer instances being managed.
     """
@@ -442,6 +470,7 @@ class MCPToolkit(BaseToolkit):
             server = _MCPServer(
                 command_or_url=cfg["url"],
                 timeout=cfg.get("timeout", None),
+                headers=cfg.get("headers", {}),
             )
             all_servers.append(server)
 
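Putting the new `headers` plumbing together: a hedged sketch that writes a config matching the docstring's schema and hands it to `MCPToolkit`; the URL and token are placeholders.

```python
import json

from camel.toolkits import MCPToolkit

config = {
    "mcpWebServers": {
        "protected-server": {
            "url": "https://example.com/mcp",
            "timeout": 30,
            "headers": {"Authorization": "Bearer YOUR_TOKEN"},
        }
    }
}
with open("mcp_config.json", "w") as f:
    json.dump(config, f)

# Each web-server entry becomes an _MCPServer; per the change above, its
# "headers" dict is forwarded to sse_client when the connection is opened.
toolkit = MCPToolkit(config_path="mcp_config.json")
```
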
camel/toolkits/memory_toolkit.py ADDED
@@ -0,0 +1,129 @@
+# ========= Copyright 2023-2024 @ CAMEL-AI.org. All Rights Reserved. =========
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ========= Copyright 2023-2024 @ CAMEL-AI.org. All Rights Reserved. =========
+import json
+from typing import TYPE_CHECKING, Optional
+
+from camel.memories import (
+    ChatHistoryMemory,
+    MemoryRecord,
+    ScoreBasedContextCreator,
+)
+from camel.toolkits.base import BaseToolkit
+from camel.toolkits.function_tool import FunctionTool
+
+if TYPE_CHECKING:
+    from camel.agents import ChatAgent
+
+
+class MemoryToolkit(BaseToolkit):
+    r"""A toolkit that provides methods for saving, loading, and clearing a
+    ChatAgent's memory.
+    These methods are exposed as FunctionTool objects for
+    function calling. Internally, it calls:
+    - agent.save_memory(path)
+    - agent.load_memory(new_memory_obj)
+    - agent.load_memory_from_path(path)
+    - agent.clear_memory()
+
+    Args:
+        agent (ChatAgent): The chat agent whose memory will be managed.
+        timeout (Optional[float], optional): Maximum execution time allowed for
+            toolkit operations in seconds. If None, no timeout is applied.
+            (default: :obj:`None`)
+    """
+
+    def __init__(self, agent: 'ChatAgent', timeout: Optional[float] = None):
+        super().__init__(timeout=timeout)
+        self.agent = agent
+
+    def save(self, path: str) -> str:
+        r"""Saves the agent's current memory to a JSON file.
+
+        Args:
+            path (str): The file path to save the memory to.
+
+        Returns:
+            str: Confirmation message.
+        """
+        self.agent.save_memory(path)
+        return f"Memory saved to {path}"
+
+    def load(self, memory_json: str) -> str:
+        r"""Loads memory into the agent from a JSON string.
+
+        Args:
+            memory_json (str): A JSON string containing memory records.
+
+        Returns:
+            str: Confirmation or error message.
+        """
+        try:
+            data = json.loads(memory_json.strip())
+            if not isinstance(data, list):
+                return "[ERROR] Memory data should be a list of records."
+
+            # Build a fresh ChatHistoryMemory
+            context_creator = ScoreBasedContextCreator(
+                token_counter=self.agent.model_backend.token_counter,
+                token_limit=self.agent.model_backend.token_limit,
+            )
+            new_memory = ChatHistoryMemory(context_creator)
+
+            # Convert each record dict -> MemoryRecord
+            for record_dict in data:
+                record = MemoryRecord.from_dict(record_dict)
+                new_memory.write_record(record)
+
+            # Load into the agent
+            self.agent.load_memory(new_memory)
+            return "Loaded memory from provided JSON string."
+        except json.JSONDecodeError:
+            return "[ERROR] Invalid JSON string provided."
+        except Exception as e:
+            return f"[ERROR] Failed to load memory: {e!s}"
+
+    def load_from_path(self, path: str) -> str:
+        r"""Loads the agent's memory from a JSON file.
+
+        Args:
+            path (str): The file path to load the memory from.
+
+        Returns:
+            str: Confirmation message.
+        """
+        self.agent.load_memory_from_path(path)
+        return f"Memory loaded from {path}"
+
+    def clear_memory(self) -> str:
+        r"""Clears the agent's memory.
+
+        Returns:
+            str: Confirmation message.
+        """
+        self.agent.clear_memory()
+        return "Memory has been cleared."
+
+    def get_tools(self) -> list[FunctionTool]:
+        r"""Expose the memory management methods as function tools
+        for the ChatAgent.
+
+        Returns:
+            list[FunctionTool]: List of FunctionTool objects.
+        """
+        return [
+            FunctionTool(self.save),
+            FunctionTool(self.load),
+            FunctionTool(self.load_from_path),
+            FunctionTool(self.clear_memory),
+        ]
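
A hedged usage sketch for the new toolkit: its methods can be called directly or handed to the agent as function tools via `get_tools()`. The agent setup is illustrative, and importing `MemoryToolkit` from `camel.toolkits` assumes the `__init__.py` change listed above re-exports it.

```python
from camel.agents import ChatAgent
from camel.toolkits import MemoryToolkit

agent = ChatAgent(system_message="You are a helpful assistant.")
toolkit = MemoryToolkit(agent=agent)

agent.step("Remember that the weekly sync moved to Friday.")
print(toolkit.save("./agent_memory.json"))  # "Memory saved to ./agent_memory.json"
print(toolkit.clear_memory())               # "Memory has been cleared."
print(toolkit.load_from_path("./agent_memory.json"))

tools = toolkit.get_tools()  # the four FunctionTools defined above
```
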
camel/types/enums.py CHANGED
@@ -63,6 +63,11 @@ class ModelType(UnifiedModelType, Enum):
     GROQ_MIXTRAL_8_7B = "mixtral-8x7b-32768"
     GROQ_GEMMA_2_9B_IT = "gemma2-9b-it"
 
+    # OpenRouter models
+    OPENROUTER_LLAMA_3_1_405B = "meta-llama/llama-3.3-405b-instruct"
+    OPENROUTER_LLAMA_3_1_70B = "meta-llama/llama-3.3-70b-instruct"
+    OPENROUTER_OLYMPICODER_7B = "open-r1/olympiccoder-7b:free"
+
     # TogetherAI platform models support tool calling
     TOGETHER_LLAMA_3_1_8B = "meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo"
     TOGETHER_LLAMA_3_1_70B = "meta-llama/Meta-Llama-3.1-70B-Instruct-Turbo"
@@ -253,6 +258,7 @@ class ModelType(UnifiedModelType, Enum):
             self.is_together,
             self.is_sambanova,
             self.is_groq,
+            self.is_openrouter,
             self.is_sglang,
             self.is_moonshot,
             self.is_siliconflow,
@@ -342,6 +348,15 @@ class ModelType(UnifiedModelType, Enum):
             ModelType.GROQ_GEMMA_2_9B_IT,
         }
 
+    @property
+    def is_openrouter(self) -> bool:
+        r"""Returns whether this type of models is served by OpenRouter."""
+        return self in {
+            ModelType.OPENROUTER_LLAMA_3_1_405B,
+            ModelType.OPENROUTER_LLAMA_3_1_70B,
+            ModelType.OPENROUTER_OLYMPICODER_7B,
+        }
+
     @property
     def is_together(self) -> bool:
         r"""Returns whether this type of models is served by Together AI."""
@@ -580,6 +595,7 @@ class ModelType(UnifiedModelType, Enum):
             ModelType.MOONSHOT_V1_8K,
             ModelType.GLM_4V_FLASH,
             ModelType.GLM_4_AIRX,
+            ModelType.OPENROUTER_OLYMPICODER_7B,
         }:
             return 8_192
         elif self in {
@@ -686,6 +702,8 @@ class ModelType(UnifiedModelType, Enum):
             ModelType.SGLANG_QWEN_2_5_7B,
             ModelType.SGLANG_QWEN_2_5_32B,
             ModelType.SGLANG_QWEN_2_5_72B,
+            ModelType.OPENROUTER_LLAMA_3_1_70B,
+            ModelType.OPENROUTER_LLAMA_3_1_405B,
         }:
             return 131_072
         elif self in {
@@ -881,6 +899,7 @@ class ModelPlatformType(Enum):
     AZURE = "azure"
     ANTHROPIC = "anthropic"
     GROQ = "groq"
+    OPENROUTER = "openrouter"
     OLLAMA = "ollama"
     LITELLM = "litellm"
     ZHIPU = "zhipuai"
@@ -931,6 +950,11 @@ class ModelPlatformType(Enum):
         r"""Returns whether this platform is groq."""
         return self is ModelPlatformType.GROQ
 
+    @property
+    def is_openrouter(self) -> bool:
+        r"""Returns whether this platform is openrouter."""
+        return self is ModelPlatformType.OPENROUTER
+
     @property
     def is_ollama(self) -> bool:
         r"""Returns whether this platform is ollama."""
camel/types/unified_model_type.py CHANGED
@@ -78,6 +78,11 @@ class UnifiedModelType(str):
         r"""Returns whether the model is a Groq served model."""
         return True
 
+    @property
+    def is_openrouter(self) -> bool:
+        r"""Returns whether the model is a OpenRouter served model."""
+        return True
+
     @property
     def is_zhipuai(self) -> bool:
         r"""Returns whether the model is a Zhipuai model."""
camel/utils/chunker/__init__.py ADDED
@@ -0,0 +1,22 @@
+# ========= Copyright 2023-2024 @ CAMEL-AI.org. All Rights Reserved. =========
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ========= Copyright 2023-2024 @ CAMEL-AI.org. All Rights Reserved. =========
+from .base import BaseChunker
+from .code_chunker import CodeChunker
+from .uio_chunker import UnstructuredIOChunker
+
+__all__ = [
+    "BaseChunker",
+    "CodeChunker",
+    "UnstructuredIOChunker",
+]
camel/utils/chunker/base.py ADDED
@@ -0,0 +1,24 @@
+# ========= Copyright 2023-2024 @ CAMEL-AI.org. All Rights Reserved. =========
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ========= Copyright 2023-2024 @ CAMEL-AI.org. All Rights Reserved. =========
+from abc import ABC, abstractmethod
+from typing import Any
+
+
+class BaseChunker(ABC):
+    r"""An abstract base class for all CAMEL chunkers."""
+
+    @abstractmethod
+    def chunk(self, content: Any) -> Any:
+        r"""Chunk the given content"""
+        pass
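
The contract is deliberately small: a chunker subclasses `BaseChunker` and implements `chunk()`. A toy illustration, not part of the package:

```python
from typing import Any, List

from camel.utils.chunker import BaseChunker


class ParagraphChunker(BaseChunker):
    """Illustrative only: split text on blank lines."""

    def chunk(self, content: Any) -> List[str]:
        return [p for p in str(content).split("\n\n") if p.strip()]
```
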
camel/utils/chunker/code_chunker.py ADDED
@@ -0,0 +1,193 @@
+# ========= Copyright 2023-2024 @ CAMEL-AI.org. All Rights Reserved. =========
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ========= Copyright 2023-2024 @ CAMEL-AI.org. All Rights Reserved. =========
+import re
+from typing import List, Optional
+
+from unstructured.documents.elements import Element, ElementMetadata
+
+from camel.messages import OpenAIUserMessage
+from camel.types import ModelType
+from camel.utils import BaseTokenCounter, OpenAITokenCounter
+
+from .base import BaseChunker
+
+
+class CodeChunker(BaseChunker):
+    r"""A class for chunking code or text while respecting structure
+    and token limits.
+
+    This class ensures that structured elements such as functions,
+    classes, and regions are not arbitrarily split across chunks.
+    It also handles oversized lines and Base64-encoded images.
+
+    Attributes:
+        chunk_size (int, optional): The maximum token size per chunk.
+            (default: :obj:`8192`)
+        token_counter (BaseTokenCounter, optional): The tokenizer used for
+            token counting, if `None`, OpenAITokenCounter will be used.
+            (default: :obj:`None`)
+        remove_image: (bool, optional): If the chunker should skip the images.
+    """
+
+    def __init__(
+        self,
+        chunk_size: int = 8192,
+        token_counter: Optional[BaseTokenCounter] = None,
+        remove_image: Optional[bool] = True,
+    ):
+        self.chunk_size = chunk_size
+        self.token_counter = (
+            token_counter
+            if token_counter
+            else OpenAITokenCounter(model=ModelType.GPT_4O_MINI)
+        )
+        self.remove_image = remove_image
+        self.struct_pattern = re.compile(
+            r'^\s*(?:(def|class|function)\s+\w+|'
+            r'(public|private|protected)\s+[\w<>]+\s+\w+\s*\(|'
+            r'\b(interface|enum|namespace)\s+\w+|'
+            r'#\s*(region|endregion)\b)'
+        )
+        self.image_pattern = re.compile(
+            r'!\[.*?\]\((?:data:image/[^;]+;base64,[a-zA-Z0-9+/]+=*|[^)]+)\)'
+        )
+
+    def count_tokens(self, text: str):
+        r"""Counts the number of tokens in the given text.
+
+        Args:
+            text (str): The input text to be tokenized.
+
+        Returns:
+            int: The number of tokens in the input text.
+        """
+        return self.token_counter.count_tokens_from_messages(
+            [OpenAIUserMessage(role="user", name="user", content=text)]
+        )
+
+    def _split_oversized(self, line: str) -> List[str]:
+        r"""Splits an oversized line into multiple chunks based on token limits
+
+        Args:
+            line (str): The oversized line to be split.
+
+        Returns:
+            List[str]: A list of smaller chunks after splitting the
+                oversized line.
+        """
+        tokens = self.token_counter.encode(line)
+        chunks = []
+        buffer = []
+        current_count = 0
+
+        for token in tokens:
+            buffer.append(token)
+            current_count += 1
+
+            if current_count >= self.chunk_size:
+                chunks.append(self.token_counter.decode(buffer).strip())
+                buffer = []
+                current_count = 0
+
+        if buffer:
+            chunks.append(self.token_counter.decode(buffer))
+        return chunks
+
+    def chunk(self, content: List[str]) -> List[Element]:
+        r"""Splits the content into smaller chunks while preserving
+        structure and adhering to token constraints.
+
+        Args:
+            content (List[str]): The content to be chunked.
+
+        Returns:
+            List[str]: A list of chunked text segments.
+        """
+        content_str = "\n".join(map(str, content))
+        chunks = []
+        current_chunk: list[str] = []
+        current_tokens = 0
+        struct_buffer: list[str] = []
+        struct_tokens = 0
+
+        for line in content_str.splitlines(keepends=True):
+            if self.remove_image:
+                if self.image_pattern.match(line):
+                    continue
+
+            line_tokens = self.count_tokens(line)
+
+            if line_tokens > self.chunk_size:
+                if current_chunk:
+                    chunks.append("".join(current_chunk))
+                    current_chunk = []
+                    current_tokens = 0
+                chunks.extend(self._split_oversized(line))
+                continue
+
+            if self.struct_pattern.match(line):
+                if struct_buffer:
+                    if current_tokens + struct_tokens <= self.chunk_size:
+                        current_chunk.extend(struct_buffer)
+                        current_tokens += struct_tokens
+                    else:
+                        if current_chunk:
+                            chunks.append("".join(current_chunk))
+                        current_chunk = struct_buffer.copy()
+                        current_tokens = struct_tokens
+                    struct_buffer = []
+                    struct_tokens = 0
+
+                struct_buffer.append(line)
+                struct_tokens += line_tokens
+            else:
+                if struct_buffer:
+                    struct_buffer.append(line)
+                    struct_tokens += line_tokens
+                else:
+                    if current_tokens + line_tokens > self.chunk_size:
+                        chunks.append("".join(current_chunk))
+                        current_chunk = [line]
+                        current_tokens = line_tokens
+                    else:
+                        current_chunk.append(line)
+                        current_tokens += line_tokens
+
+        if struct_buffer:
+            if current_tokens + struct_tokens <= self.chunk_size:
+                current_chunk.extend(struct_buffer)
+            else:
+                if current_chunk:
+                    chunks.append("".join(current_chunk))
+                current_chunk = struct_buffer
+
+        if current_chunk:
+            chunks.append("".join(current_chunk))
+
+        final_chunks = []
+        for chunk in chunks:
+            chunk_token = self.count_tokens(chunk)
+            if chunk_token > self.chunk_size:
+                final_chunks.extend(self._split_oversized(chunk))
+            else:
+                final_chunks.append(chunk)
+
+        # TODO: need to reconsider how to correctly form metadata (maybe need
+        # to decouple the connection with unstructuredIO)
+        chunked_elements = []
+        for chunk in final_chunks:
+            element = Element(metadata=ElementMetadata())
+            element.text = chunk
+            chunked_elements.append(element)
+        return chunked_elements
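
A usage sketch grounded in the signatures above: `chunk()` takes a list of strings and returns unstructured `Element` objects whose `.text` carries each chunk. The file name is illustrative.

```python
from camel.utils.chunker import CodeChunker

chunker = CodeChunker(chunk_size=4096)  # token budget per chunk
with open("my_module.py") as f:
    elements = chunker.chunk([f.read()])

for element in elements:
    print(chunker.count_tokens(element.text), "tokens")
```
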
camel/utils/chunker/uio_chunker.py ADDED
@@ -0,0 +1,66 @@
+# ========= Copyright 2023-2024 @ CAMEL-AI.org. All Rights Reserved. =========
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ========= Copyright 2023-2024 @ CAMEL-AI.org. All Rights Reserved. =========
+from typing import List, Optional
+
+from unstructured.documents.elements import Element
+
+from camel.loaders import UnstructuredIO
+from camel.utils.chunker import BaseChunker
+
+
+class UnstructuredIOChunker(BaseChunker):
+    r"""A class for chunking text while respecting structure and
+    character limits.
+
+    This class ensures that structured elements, such as document sections
+    and titles, are not arbitrarily split across chunks. It utilizes the
+    `UnstructuredIO` class to process and segment elements while maintaining
+    readability and coherence. The chunking method can be adjusted based on
+    the provided `chunk_type` parameter.
+
+    Args:
+        chunk_type (str, optional): The method used for chunking text.
+            (default: :obj:`"chunk_by_title"`)
+        max_characters (int, optional): The maximum number of characters
+            allowed per chunk. (default: :obj:`500`)
+        metadata_filename (Optional[str], optional): An optional filename
+            for storing metadata related to chunking. (default: :obj:`None`)
+    """
+
+    def __init__(
+        self,
+        chunk_type: str = "chunk_by_title",
+        max_characters: int = 500,
+        metadata_filename: Optional[str] = None,
+    ):
+        self.uio = UnstructuredIO()
+        self.chunk_type = chunk_type
+        self.max_characters = max_characters
+        self.metadata_filename = metadata_filename
+
+    def chunk(self, content: List[Element]) -> List[Element]:
+        r"""Splits the content into smaller chunks while preserving
+        structure and adhering to token constraints.
+
+        Args:
+            content (List[Element]): The content to be chunked.
+
+        Returns:
+            List[Element]: A list of chunked text segments.
+        """
+        return self.uio.chunk_elements(
+            chunk_type=self.chunk_type,
+            elements=content,
+            max_characters=self.max_characters,
+        )
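
A hedged sketch of the intended flow: parse a document with `UnstructuredIO`, then re-chunk the resulting elements by title. The `parse_file_or_url` call assumes the existing loader API; the file name is illustrative.

```python
from camel.loaders import UnstructuredIO
from camel.utils.chunker import UnstructuredIOChunker

uio = UnstructuredIO()
elements = uio.parse_file_or_url("report.pdf")  # assumed loader method

chunker = UnstructuredIOChunker(chunk_type="chunk_by_title", max_characters=500)
chunks = chunker.chunk(content=elements or [])  # guard the Optional return
print(len(chunks))
```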