camel-ai 0.2.34__py3-none-any.whl → 0.2.36__py3-none-any.whl
This diff shows the content of publicly available package versions as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their public registry.
Potentially problematic release: this version of camel-ai might be problematic.
- camel/__init__.py +1 -1
- camel/agents/_types.py +1 -1
- camel/agents/_utils.py +4 -4
- camel/agents/chat_agent.py +174 -29
- camel/configs/__init__.py +3 -0
- camel/configs/openai_config.py +20 -16
- camel/configs/openrouter_config.py +106 -0
- camel/datasets/base_generator.py +188 -27
- camel/datasets/few_shot_generator.py +2 -5
- camel/environments/single_step.py +1 -7
- camel/memories/agent_memories.py +49 -2
- camel/memories/base.py +23 -1
- camel/memories/blocks/chat_history_block.py +2 -1
- camel/memories/records.py +5 -0
- camel/models/__init__.py +2 -0
- camel/models/gemini_model.py +36 -0
- camel/models/groq_model.py +6 -3
- camel/models/model_factory.py +3 -0
- camel/models/openrouter_model.py +204 -0
- camel/models/stub_model.py +25 -0
- camel/retrievers/vector_retriever.py +12 -7
- camel/storages/__init__.py +2 -0
- camel/storages/key_value_storages/__init__.py +4 -1
- camel/storages/key_value_storages/json.py +3 -7
- camel/storages/key_value_storages/mem0_cloud.py +224 -0
- camel/storages/vectordb_storages/base.py +5 -1
- camel/storages/vectordb_storages/qdrant.py +3 -3
- camel/toolkits/__init__.py +2 -1
- camel/toolkits/browser_toolkit.py +43 -0
- camel/toolkits/code_execution.py +2 -1
- camel/toolkits/mcp_toolkit.py +30 -1
- camel/toolkits/memory_toolkit.py +129 -0
- camel/types/enums.py +24 -0
- camel/types/unified_model_type.py +5 -0
- camel/utils/chunker/__init__.py +22 -0
- camel/utils/chunker/base.py +24 -0
- camel/utils/chunker/code_chunker.py +193 -0
- camel/utils/chunker/uio_chunker.py +66 -0
- camel/utils/token_counting.py +133 -0
- camel/verifiers/__init__.py +1 -2
- camel/verifiers/base.py +133 -96
- camel/verifiers/models.py +0 -12
- camel/verifiers/python_verifier.py +25 -14
- {camel_ai-0.2.34.dist-info → camel_ai-0.2.36.dist-info}/METADATA +3 -1
- {camel_ai-0.2.34.dist-info → camel_ai-0.2.36.dist-info}/RECORD +47 -39
- {camel_ai-0.2.34.dist-info → camel_ai-0.2.36.dist-info}/WHEEL +0 -0
- {camel_ai-0.2.34.dist-info → camel_ai-0.2.36.dist-info}/licenses/LICENSE +0 -0
camel/toolkits/browser_toolkit.py CHANGED

@@ -438,6 +438,7 @@ class BaseBrowser:
             sync_playwright,
         )
 
+        self._ensure_browser_installed()
         self.history: list = []
         self.headless = headless
         self.playwright = sync_playwright().start()

@@ -914,6 +915,48 @@ class BaseBrowser:
         markdown_content = html2text(html_content)
         return markdown_content
 
+    def _ensure_browser_installed(self) -> None:
+        r"""Ensure the browser is installed."""
+        import platform
+        import subprocess
+        import sys
+
+        try:
+            from playwright.sync_api import sync_playwright
+
+            with sync_playwright() as p:
+                browser = p.chromium.launch()
+                browser.close()
+        except Exception:
+            logger.info("Installing Chromium browser...")
+            try:
+                subprocess.run(
+                    [
+                        sys.executable,
+                        "-m",
+                        "playwright",
+                        "install",
+                        "chromium",
+                    ],
+                    check=True,
+                    capture_output=True,
+                )
+                if platform.system().lower() == "linux":
+                    subprocess.run(
+                        [
+                            sys.executable,
+                            "-m",
+                            "playwright",
+                            "install-deps",
+                            "chromium",
+                        ],
+                        check=True,
+                        capture_output=True,
+                    )
+                logger.info("Chromium browser installation completed")
+            except subprocess.CalledProcessError as e:
+                raise RuntimeError(f"Failed to install browser: {e.stderr}")
+
 
 class BrowserToolkit(BaseToolkit):
     r"""A class for browsing the web and interacting with web pages.
camel/toolkits/code_execution.py CHANGED

@@ -29,6 +29,7 @@ class CodeExecutionToolkit(BaseToolkit):
 
     Args:
         sandbox (str): The environment type used to execute code.
+            (default: `subprocess`)
         verbose (bool): Whether to print the output of the code execution.
             (default: :obj:`False`)
         unsafe_mode (bool): If `True`, the interpreter runs the code

@@ -43,7 +44,7 @@ class CodeExecutionToolkit(BaseToolkit):
         self,
         sandbox: Literal[
             "internal_python", "jupyter", "docker", "subprocess", "e2b"
-        ] = "
+        ] = "subprocess",
         verbose: bool = False,
         unsafe_mode: bool = False,
         import_white_list: Optional[List[str]] = None,
camel/toolkits/mcp_toolkit.py CHANGED

@@ -56,6 +56,8 @@ class _MCPServer(BaseToolkit):
         env (Dict[str, str]): Environment variables for the stdio mode command.
             (default: :obj:`'None'`)
         timeout (Optional[float]): Connection timeout. (default: :obj:`'None'`)
+        headers (Dict[str, str]): Headers for the HTTP request.
+            (default: :obj:`'None'`)
     """
 
     def __init__(

@@ -64,6 +66,7 @@ class _MCPServer(BaseToolkit):
         args: Optional[List[str]] = None,
         env: Optional[Dict[str, str]] = None,
         timeout: Optional[float] = None,
+        headers: Optional[Dict[str, str]] = None,
     ):
         from mcp import Tool
         from mcp.client.session import ClientSession

@@ -73,6 +76,7 @@ class _MCPServer(BaseToolkit):
         self.command_or_url = command_or_url
         self.args = args or []
         self.env = env or {}
+        self.headers = headers or {}
 
         self._mcp_tools: List[Tool] = []
         self._session: Optional['ClientSession'] = None

@@ -99,7 +103,10 @@ class _MCPServer(BaseToolkit):
                 read_stream,
                 write_stream,
             ) = await self._exit_stack.enter_async_context(
-                sse_client(
+                sse_client(
+                    self.command_or_url,
+                    headers=self.headers,
+                )
             )
         else:
             command = self.command_or_url

@@ -343,6 +350,27 @@ class MCPToolkit(BaseToolkit):
     Either `servers` or `config_path` must be provided. If both are
     provided, servers from both sources will be combined.
 
+    For web servers in the config file, you can specify authorization
+    headers using the "headers" field to connect to protected MCP server
+    endpoints.
+
+    Example configuration:
+
+    .. code-block:: json
+
+        {
+          "mcpWebServers": {
+            "protected-server": {
+              "url": "https://example.com/mcp",
+              "timeout": 30,
+              "headers": {
+                "Authorization": "Bearer YOUR_TOKEN",
+                "X-API-Key": "YOUR_API_KEY"
+              }
+            }
+          }
+        }
+
     Attributes:
         servers (List[_MCPServer]): List of _MCPServer instances being managed.
     """

@@ -442,6 +470,7 @@ class MCPToolkit(BaseToolkit):
         server = _MCPServer(
             command_or_url=cfg["url"],
             timeout=cfg.get("timeout", None),
+            headers=cfg.get("headers", {}),
         )
         all_servers.append(server)
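
A short sketch of the new header support end to end, reusing the docstring's config example above; the config file name is hypothetical, and `config_path` is confirmed by the `MCPToolkit` docstring:

from camel.toolkits import MCPToolkit  # re-export path assumed

# mcp_config.json (hypothetical file) holds the "mcpWebServers" block shown
# above; each server's "headers" dict is forwarded to sse_client() when the
# SSE connection to the protected endpoint is opened.
toolkit = MCPToolkit(config_path="mcp_config.json")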
camel/toolkits/memory_toolkit.py ADDED

@@ -0,0 +1,129 @@
+# ========= Copyright 2023-2024 @ CAMEL-AI.org. All Rights Reserved. =========
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ========= Copyright 2023-2024 @ CAMEL-AI.org. All Rights Reserved. =========
+import json
+from typing import TYPE_CHECKING, Optional
+
+from camel.memories import (
+    ChatHistoryMemory,
+    MemoryRecord,
+    ScoreBasedContextCreator,
+)
+from camel.toolkits.base import BaseToolkit
+from camel.toolkits.function_tool import FunctionTool
+
+if TYPE_CHECKING:
+    from camel.agents import ChatAgent
+
+
+class MemoryToolkit(BaseToolkit):
+    r"""A toolkit that provides methods for saving, loading, and clearing a
+    ChatAgent's memory.
+    These methods are exposed as FunctionTool objects for
+    function calling. Internally, it calls:
+    - agent.save_memory(path)
+    - agent.load_memory(new_memory_obj)
+    - agent.load_memory_from_path(path)
+    - agent.clear_memory()
+
+    Args:
+        agent (ChatAgent): The chat agent whose memory will be managed.
+        timeout (Optional[float], optional): Maximum execution time allowed for
+            toolkit operations in seconds. If None, no timeout is applied.
+            (default: :obj:`None`)
+    """
+
+    def __init__(self, agent: 'ChatAgent', timeout: Optional[float] = None):
+        super().__init__(timeout=timeout)
+        self.agent = agent
+
+    def save(self, path: str) -> str:
+        r"""Saves the agent's current memory to a JSON file.
+
+        Args:
+            path (str): The file path to save the memory to.
+
+        Returns:
+            str: Confirmation message.
+        """
+        self.agent.save_memory(path)
+        return f"Memory saved to {path}"
+
+    def load(self, memory_json: str) -> str:
+        r"""Loads memory into the agent from a JSON string.
+
+        Args:
+            memory_json (str): A JSON string containing memory records.
+
+        Returns:
+            str: Confirmation or error message.
+        """
+        try:
+            data = json.loads(memory_json.strip())
+            if not isinstance(data, list):
+                return "[ERROR] Memory data should be a list of records."
+
+            # Build a fresh ChatHistoryMemory
+            context_creator = ScoreBasedContextCreator(
+                token_counter=self.agent.model_backend.token_counter,
+                token_limit=self.agent.model_backend.token_limit,
+            )
+            new_memory = ChatHistoryMemory(context_creator)
+
+            # Convert each record dict -> MemoryRecord
+            for record_dict in data:
+                record = MemoryRecord.from_dict(record_dict)
+                new_memory.write_record(record)
+
+            # Load into the agent
+            self.agent.load_memory(new_memory)
+            return "Loaded memory from provided JSON string."
+        except json.JSONDecodeError:
+            return "[ERROR] Invalid JSON string provided."
+        except Exception as e:
+            return f"[ERROR] Failed to load memory: {e!s}"
+
+    def load_from_path(self, path: str) -> str:
+        r"""Loads the agent's memory from a JSON file.
+
+        Args:
+            path (str): The file path to load the memory from.
+
+        Returns:
+            str: Confirmation message.
+        """
+        self.agent.load_memory_from_path(path)
+        return f"Memory loaded from {path}"
+
+    def clear_memory(self) -> str:
+        r"""Clears the agent's memory.
+
+        Returns:
+            str: Confirmation message.
+        """
+        self.agent.clear_memory()
+        return "Memory has been cleared."
+
+    def get_tools(self) -> list[FunctionTool]:
+        r"""Expose the memory management methods as function tools
+        for the ChatAgent.
+
+        Returns:
+            list[FunctionTool]: List of FunctionTool objects.
+        """
+        return [
+            FunctionTool(self.save),
+            FunctionTool(self.load),
+            FunctionTool(self.load_from_path),
+            FunctionTool(self.clear_memory),
+        ]
camel/types/enums.py CHANGED

@@ -63,6 +63,11 @@ class ModelType(UnifiedModelType, Enum):
     GROQ_MIXTRAL_8_7B = "mixtral-8x7b-32768"
     GROQ_GEMMA_2_9B_IT = "gemma2-9b-it"
 
+    # OpenRouter models
+    OPENROUTER_LLAMA_3_1_405B = "meta-llama/llama-3.3-405b-instruct"
+    OPENROUTER_LLAMA_3_1_70B = "meta-llama/llama-3.3-70b-instruct"
+    OPENROUTER_OLYMPICODER_7B = "open-r1/olympiccoder-7b:free"
+
     # TogetherAI platform models support tool calling
     TOGETHER_LLAMA_3_1_8B = "meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo"
     TOGETHER_LLAMA_3_1_70B = "meta-llama/Meta-Llama-3.1-70B-Instruct-Turbo"

@@ -253,6 +258,7 @@ class ModelType(UnifiedModelType, Enum):
                 self.is_together,
                 self.is_sambanova,
                 self.is_groq,
+                self.is_openrouter,
                 self.is_sglang,
                 self.is_moonshot,
                 self.is_siliconflow,

@@ -342,6 +348,15 @@ class ModelType(UnifiedModelType, Enum):
             ModelType.GROQ_GEMMA_2_9B_IT,
         }
 
+    @property
+    def is_openrouter(self) -> bool:
+        r"""Returns whether this type of models is served by OpenRouter."""
+        return self in {
+            ModelType.OPENROUTER_LLAMA_3_1_405B,
+            ModelType.OPENROUTER_LLAMA_3_1_70B,
+            ModelType.OPENROUTER_OLYMPICODER_7B,
+        }
+
     @property
     def is_together(self) -> bool:
         r"""Returns whether this type of models is served by Together AI."""

@@ -580,6 +595,7 @@ class ModelType(UnifiedModelType, Enum):
             ModelType.MOONSHOT_V1_8K,
             ModelType.GLM_4V_FLASH,
             ModelType.GLM_4_AIRX,
+            ModelType.OPENROUTER_OLYMPICODER_7B,
         }:
             return 8_192
         elif self in {

@@ -686,6 +702,8 @@ class ModelType(UnifiedModelType, Enum):
             ModelType.SGLANG_QWEN_2_5_7B,
             ModelType.SGLANG_QWEN_2_5_32B,
             ModelType.SGLANG_QWEN_2_5_72B,
+            ModelType.OPENROUTER_LLAMA_3_1_70B,
+            ModelType.OPENROUTER_LLAMA_3_1_405B,
         }:
             return 131_072
         elif self in {

@@ -881,6 +899,7 @@ class ModelPlatformType(Enum):
     AZURE = "azure"
     ANTHROPIC = "anthropic"
     GROQ = "groq"
+    OPENROUTER = "openrouter"
     OLLAMA = "ollama"
     LITELLM = "litellm"
     ZHIPU = "zhipuai"

@@ -931,6 +950,11 @@ class ModelPlatformType(Enum):
         r"""Returns whether this platform is groq."""
         return self is ModelPlatformType.GROQ
 
+    @property
+    def is_openrouter(self) -> bool:
+        r"""Returns whether this platform is openrouter."""
+        return self is ModelPlatformType.OPENROUTER
+
     @property
     def is_ollama(self) -> bool:
         r"""Returns whether this platform is ollama."""
camel/types/unified_model_type.py CHANGED

@@ -78,6 +78,11 @@ class UnifiedModelType(str):
         r"""Returns whether the model is a Groq served model."""
         return True
 
+    @property
+    def is_openrouter(self) -> bool:
+        r"""Returns whether the model is a OpenRouter served model."""
+        return True
+
     @property
     def is_zhipuai(self) -> bool:
         r"""Returns whether the model is a Zhipuai model."""
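
Together with the new `openrouter_model.py` backend and the `model_factory.py` hook listed at the top of this diff, these enum additions make OpenRouter selectable through the usual factory path. A minimal sketch, assuming `ModelFactory.create` keeps its existing `model_platform`/`model_type` signature:

from camel.models import ModelFactory
from camel.types import ModelPlatformType, ModelType

# The API-key environment variable is not visible in this diff; the backend
# presumably reads it the same way the other platform backends do.
model = ModelFactory.create(
    model_platform=ModelPlatformType.OPENROUTER,
    model_type=ModelType.OPENROUTER_LLAMA_3_1_70B,
)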
camel/utils/chunker/__init__.py ADDED

@@ -0,0 +1,22 @@
+# ========= Copyright 2023-2024 @ CAMEL-AI.org. All Rights Reserved. =========
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ========= Copyright 2023-2024 @ CAMEL-AI.org. All Rights Reserved. =========
+from .base import BaseChunker
+from .code_chunker import CodeChunker
+from .uio_chunker import UnstructuredIOChunker
+
+__all__ = [
+    "BaseChunker",
+    "CodeChunker",
+    "UnstructuredIOChunker",
+]
camel/utils/chunker/base.py ADDED

@@ -0,0 +1,24 @@
+# ========= Copyright 2023-2024 @ CAMEL-AI.org. All Rights Reserved. =========
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ========= Copyright 2023-2024 @ CAMEL-AI.org. All Rights Reserved. =========
+from abc import ABC, abstractmethod
+from typing import Any
+
+
+class BaseChunker(ABC):
+    r"""An abstract base class for all CAMEL chunkers."""
+
+    @abstractmethod
+    def chunk(self, content: Any) -> Any:
+        r"""Chunk the given content"""
+        pass
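
Since `BaseChunker` only pins down a single `chunk` method, writing a custom chunker is a few lines. `LineChunker` below is a hypothetical example, not part of the package:

from typing import Any, List

from camel.utils.chunker import BaseChunker


class LineChunker(BaseChunker):
    r"""Hypothetical chunker: one chunk per non-empty line."""

    def chunk(self, content: Any) -> List[str]:
        return [ln for ln in str(content).splitlines() if ln.strip()]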
camel/utils/chunker/code_chunker.py ADDED

@@ -0,0 +1,193 @@
+# ========= Copyright 2023-2024 @ CAMEL-AI.org. All Rights Reserved. =========
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ========= Copyright 2023-2024 @ CAMEL-AI.org. All Rights Reserved. =========
+import re
+from typing import List, Optional
+
+from unstructured.documents.elements import Element, ElementMetadata
+
+from camel.messages import OpenAIUserMessage
+from camel.types import ModelType
+from camel.utils import BaseTokenCounter, OpenAITokenCounter
+
+from .base import BaseChunker
+
+
+class CodeChunker(BaseChunker):
+    r"""A class for chunking code or text while respecting structure
+    and token limits.
+
+    This class ensures that structured elements such as functions,
+    classes, and regions are not arbitrarily split across chunks.
+    It also handles oversized lines and Base64-encoded images.
+
+    Attributes:
+        chunk_size (int, optional): The maximum token size per chunk.
+            (default: :obj:`8192`)
+        token_counter (BaseTokenCounter, optional): The tokenizer used for
+            token counting, if `None`, OpenAITokenCounter will be used.
+            (default: :obj:`None`)
+        remove_image: (bool, optional): If the chunker should skip the images.
+    """
+
+    def __init__(
+        self,
+        chunk_size: int = 8192,
+        token_counter: Optional[BaseTokenCounter] = None,
+        remove_image: Optional[bool] = True,
+    ):
+        self.chunk_size = chunk_size
+        self.token_counter = (
+            token_counter
+            if token_counter
+            else OpenAITokenCounter(model=ModelType.GPT_4O_MINI)
+        )
+        self.remove_image = remove_image
+        self.struct_pattern = re.compile(
+            r'^\s*(?:(def|class|function)\s+\w+|'
+            r'(public|private|protected)\s+[\w<>]+\s+\w+\s*\(|'
+            r'\b(interface|enum|namespace)\s+\w+|'
+            r'#\s*(region|endregion)\b)'
+        )
+        self.image_pattern = re.compile(
+            r'!\[.*?\]\((?:data:image/[^;]+;base64,[a-zA-Z0-9+/]+=*|[^)]+)\)'
+        )
+
+    def count_tokens(self, text: str):
+        r"""Counts the number of tokens in the given text.
+
+        Args:
+            text (str): The input text to be tokenized.
+
+        Returns:
+            int: The number of tokens in the input text.
+        """
+        return self.token_counter.count_tokens_from_messages(
+            [OpenAIUserMessage(role="user", name="user", content=text)]
+        )
+
+    def _split_oversized(self, line: str) -> List[str]:
+        r"""Splits an oversized line into multiple chunks based on token limits
+
+        Args:
+            line (str): The oversized line to be split.
+
+        Returns:
+            List[str]: A list of smaller chunks after splitting the
+                oversized line.
+        """
+        tokens = self.token_counter.encode(line)
+        chunks = []
+        buffer = []
+        current_count = 0
+
+        for token in tokens:
+            buffer.append(token)
+            current_count += 1
+
+            if current_count >= self.chunk_size:
+                chunks.append(self.token_counter.decode(buffer).strip())
+                buffer = []
+                current_count = 0
+
+        if buffer:
+            chunks.append(self.token_counter.decode(buffer))
+        return chunks
+
+    def chunk(self, content: List[str]) -> List[Element]:
+        r"""Splits the content into smaller chunks while preserving
+        structure and adhering to token constraints.
+
+        Args:
+            content (List[str]): The content to be chunked.
+
+        Returns:
+            List[str]: A list of chunked text segments.
+        """
+        content_str = "\n".join(map(str, content))
+        chunks = []
+        current_chunk: list[str] = []
+        current_tokens = 0
+        struct_buffer: list[str] = []
+        struct_tokens = 0
+
+        for line in content_str.splitlines(keepends=True):
+            if self.remove_image:
+                if self.image_pattern.match(line):
+                    continue
+
+            line_tokens = self.count_tokens(line)
+
+            if line_tokens > self.chunk_size:
+                if current_chunk:
+                    chunks.append("".join(current_chunk))
+                    current_chunk = []
+                    current_tokens = 0
+                chunks.extend(self._split_oversized(line))
+                continue
+
+            if self.struct_pattern.match(line):
+                if struct_buffer:
+                    if current_tokens + struct_tokens <= self.chunk_size:
+                        current_chunk.extend(struct_buffer)
+                        current_tokens += struct_tokens
+                    else:
+                        if current_chunk:
+                            chunks.append("".join(current_chunk))
+                        current_chunk = struct_buffer.copy()
+                        current_tokens = struct_tokens
+                    struct_buffer = []
+                    struct_tokens = 0
+
+                struct_buffer.append(line)
+                struct_tokens += line_tokens
+            else:
+                if struct_buffer:
+                    struct_buffer.append(line)
+                    struct_tokens += line_tokens
+                else:
+                    if current_tokens + line_tokens > self.chunk_size:
+                        chunks.append("".join(current_chunk))
+                        current_chunk = [line]
+                        current_tokens = line_tokens
+                    else:
+                        current_chunk.append(line)
+                        current_tokens += line_tokens
+
+        if struct_buffer:
+            if current_tokens + struct_tokens <= self.chunk_size:
+                current_chunk.extend(struct_buffer)
+            else:
+                if current_chunk:
+                    chunks.append("".join(current_chunk))
+                current_chunk = struct_buffer
+
+        if current_chunk:
+            chunks.append("".join(current_chunk))
+
+        final_chunks = []
+        for chunk in chunks:
+            chunk_token = self.count_tokens(chunk)
+            if chunk_token > self.chunk_size:
+                final_chunks.extend(self._split_oversized(chunk))
+            else:
+                final_chunks.append(chunk)
+
+        # TODO: need to reconsider how to correctly form metadata (maybe need
+        # to decouple the connection with unstructuredIO)
+        chunked_elements = []
+        for chunk in final_chunks:
+            element = Element(metadata=ElementMetadata())
+            element.text = chunk
+            chunked_elements.append(element)
+        return chunked_elements
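
A usage sketch for the new chunker; the input file name is hypothetical, and note that `chunk` accepts a list of strings but returns unstructured `Element` objects:

from camel.utils.chunker import CodeChunker

chunker = CodeChunker(chunk_size=2048)
with open("big_module.py") as f:  # hypothetical source file
    elements = chunker.chunk(f.readlines())
for el in elements:               # each Element carries one text chunk
    print(el.text[:60])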
camel/utils/chunker/uio_chunker.py ADDED

@@ -0,0 +1,66 @@
+# ========= Copyright 2023-2024 @ CAMEL-AI.org. All Rights Reserved. =========
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ========= Copyright 2023-2024 @ CAMEL-AI.org. All Rights Reserved. =========
+from typing import List, Optional
+
+from unstructured.documents.elements import Element
+
+from camel.loaders import UnstructuredIO
+from camel.utils.chunker import BaseChunker
+
+
+class UnstructuredIOChunker(BaseChunker):
+    r"""A class for chunking text while respecting structure and
+    character limits.
+
+    This class ensures that structured elements, such as document sections
+    and titles, are not arbitrarily split across chunks. It utilizes the
+    `UnstructuredIO` class to process and segment elements while maintaining
+    readability and coherence. The chunking method can be adjusted based on
+    the provided `chunk_type` parameter.
+
+    Args:
+        chunk_type (str, optional): The method used for chunking text.
+            (default: :obj:`"chunk_by_title"`)
+        max_characters (int, optional): The maximum number of characters
+            allowed per chunk. (default: :obj:`500`)
+        metadata_filename (Optional[str], optional): An optional filename
+            for storing metadata related to chunking. (default: :obj:`None`)
+    """
+
+    def __init__(
+        self,
+        chunk_type: str = "chunk_by_title",
+        max_characters: int = 500,
+        metadata_filename: Optional[str] = None,
+    ):
+        self.uio = UnstructuredIO()
+        self.chunk_type = chunk_type
+        self.max_characters = max_characters
+        self.metadata_filename = metadata_filename
+
+    def chunk(self, content: List[Element]) -> List[Element]:
+        r"""Splits the content into smaller chunks while preserving
+        structure and adhering to token constraints.
+
+        Args:
+            content (List[Element]): The content to be chunked.
+
+        Returns:
+            List[Element]: A list of chunked text segments.
+        """
+        return self.uio.chunk_elements(
+            chunk_type=self.chunk_type,
+            elements=content,
+            max_characters=self.max_characters,
+        )
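
And the corresponding sketch for the UnstructuredIO-backed chunker, assuming `UnstructuredIO.parse_file_or_url` (camel's existing loader entry point, not shown in this diff) as the element source and a hypothetical input file:

from camel.loaders import UnstructuredIO
from camel.utils.chunker import UnstructuredIOChunker

elements = UnstructuredIO().parse_file_or_url("whitepaper.pdf")  # hypothetical input
chunks = UnstructuredIOChunker(
    chunk_type="chunk_by_title",
    max_characters=800,
).chunk(elements)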