agno 2.4.4__py3-none-any.whl → 2.4.6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- agno/agent/agent.py +47 -39
- agno/db/surrealdb/models.py +1 -1
- agno/knowledge/chunking/agentic.py +1 -5
- agno/knowledge/chunking/code.py +1 -1
- agno/knowledge/chunking/document.py +22 -42
- agno/knowledge/chunking/fixed.py +1 -5
- agno/knowledge/chunking/markdown.py +9 -25
- agno/knowledge/chunking/recursive.py +1 -3
- agno/knowledge/chunking/row.py +3 -2
- agno/knowledge/chunking/semantic.py +1 -1
- agno/knowledge/chunking/strategy.py +19 -0
- agno/knowledge/knowledge.py +181 -24
- agno/knowledge/reader/text_reader.py +1 -1
- agno/learn/stores/learned_knowledge.py +108 -131
- agno/team/team.py +27 -20
- agno/tools/seltz.py +134 -0
- agno/utils/print_response/agent.py +8 -8
- agno/utils/print_response/team.py +8 -8
- {agno-2.4.4.dist-info → agno-2.4.6.dist-info}/METADATA +36 -58
- {agno-2.4.4.dist-info → agno-2.4.6.dist-info}/RECORD +23 -22
- {agno-2.4.4.dist-info → agno-2.4.6.dist-info}/WHEEL +0 -0
- {agno-2.4.4.dist-info → agno-2.4.6.dist-info}/licenses/LICENSE +0 -0
- {agno-2.4.4.dist-info → agno-2.4.6.dist-info}/top_level.txt +0 -0
agno/agent/agent.py
CHANGED
|
@@ -320,6 +320,8 @@ class Agent:
|
|
|
320
320
|
# Add a tool that allows the Model to search the knowledge base (aka Agentic RAG)
|
|
321
321
|
# Added only if knowledge is provided.
|
|
322
322
|
search_knowledge: bool = True
|
|
323
|
+
# If True, add search_knowledge instructions to the system prompt
|
|
324
|
+
add_search_knowledge_instructions: bool = True
|
|
323
325
|
# Add a tool that allows the Agent to update Knowledge.
|
|
324
326
|
update_knowledge: bool = False
|
|
325
327
|
# Add a tool that allows the Model to get the tool call history.
|
|
@@ -524,6 +526,7 @@ class Agent:
|
|
|
524
526
|
reasoning_max_steps: int = 10,
|
|
525
527
|
read_chat_history: bool = False,
|
|
526
528
|
search_knowledge: bool = True,
|
|
529
|
+
add_search_knowledge_instructions: bool = True,
|
|
527
530
|
update_knowledge: bool = False,
|
|
528
531
|
read_tool_call_history: bool = False,
|
|
529
532
|
send_media_to_model: bool = True,
|
|
@@ -661,6 +664,7 @@ class Agent:
|
|
|
661
664
|
|
|
662
665
|
self.read_chat_history = read_chat_history
|
|
663
666
|
self.search_knowledge = search_knowledge
|
|
667
|
+
self.add_search_knowledge_instructions = add_search_knowledge_instructions
|
|
664
668
|
self.update_knowledge = update_knowledge
|
|
665
669
|
self.read_tool_call_history = read_tool_call_history
|
|
666
670
|
self.send_media_to_model = send_media_to_model
|
|
@@ -854,8 +858,9 @@ class Agent:
|
|
|
854
858
|
return
|
|
855
859
|
|
|
856
860
|
# Handle learning=True: create default LearningMachine
|
|
861
|
+
# Enables user_profile (structured fields) and user_memory (unstructured observations)
|
|
857
862
|
if self.learning is True:
|
|
858
|
-
self._learning = LearningMachine(db=self.db, model=self.model, user_profile=True)
|
|
863
|
+
self._learning = LearningMachine(db=self.db, model=self.model, user_profile=True, user_memory=True)
|
|
859
864
|
return
|
|
860
865
|
|
|
861
866
|
# Handle learning=LearningMachine(...): inject dependencies
|
|
@@ -7431,6 +7436,10 @@ class Agent:
|
|
|
7431
7436
|
config["enable_agentic_knowledge_filters"] = self.enable_agentic_knowledge_filters
|
|
7432
7437
|
if self.add_knowledge_to_context:
|
|
7433
7438
|
config["add_knowledge_to_context"] = self.add_knowledge_to_context
|
|
7439
|
+
if not self.search_knowledge:
|
|
7440
|
+
config["search_knowledge"] = self.search_knowledge
|
|
7441
|
+
if self.add_search_knowledge_instructions:
|
|
7442
|
+
config["add_search_knowledge_instructions"] = self.add_search_knowledge_instructions
|
|
7434
7443
|
# Skip knowledge_retriever as it's a callable
|
|
7435
7444
|
if self.references_format != "json":
|
|
7436
7445
|
config["references_format"] = self.references_format
|
|
@@ -7481,8 +7490,6 @@ class Agent:
|
|
|
7481
7490
|
# --- Default tools settings ---
|
|
7482
7491
|
if self.read_chat_history:
|
|
7483
7492
|
config["read_chat_history"] = self.read_chat_history
|
|
7484
|
-
if not self.search_knowledge:
|
|
7485
|
-
config["search_knowledge"] = self.search_knowledge
|
|
7486
7493
|
if self.update_knowledge:
|
|
7487
7494
|
config["update_knowledge"] = self.update_knowledge
|
|
7488
7495
|
if self.read_tool_call_history:
|
|
@@ -7806,6 +7813,7 @@ class Agent:
|
|
|
7806
7813
|
# --- Default tools settings ---
|
|
7807
7814
|
read_chat_history=config.get("read_chat_history", False),
|
|
7808
7815
|
search_knowledge=config.get("search_knowledge", True),
|
|
7816
|
+
add_search_knowledge_instructions=config.get("add_search_knowledge_instructions", True),
|
|
7809
7817
|
update_knowledge=config.get("update_knowledge", False),
|
|
7810
7818
|
read_tool_call_history=config.get("read_tool_call_history", False),
|
|
7811
7819
|
send_media_to_model=config.get("send_media_to_model", True),
|
|
@@ -8859,16 +8867,6 @@ class Agent:
|
|
|
8859
8867
|
if self.name is not None and self.add_name_to_context:
|
|
8860
8868
|
additional_information.append(f"Your name is: {self.name}.")
|
|
8861
8869
|
|
|
8862
|
-
# 3.2.5 Add knowledge context using protocol's build_context
|
|
8863
|
-
if self.knowledge is not None:
|
|
8864
|
-
build_context_fn = getattr(self.knowledge, "build_context", None)
|
|
8865
|
-
if callable(build_context_fn):
|
|
8866
|
-
knowledge_context = build_context_fn(
|
|
8867
|
-
enable_agentic_filters=self.enable_agentic_knowledge_filters,
|
|
8868
|
-
)
|
|
8869
|
-
if knowledge_context:
|
|
8870
|
-
additional_information.append(knowledge_context)
|
|
8871
|
-
|
|
8872
8870
|
# 3.3 Build the default system message for the Agent.
|
|
8873
8871
|
system_message_content: str = ""
|
|
8874
8872
|
# 3.3.1 First add the Agent description if provided
|
|
@@ -9045,12 +9043,22 @@ class Agent:
|
|
|
9045
9043
|
if learning_context:
|
|
9046
9044
|
system_message_content += learning_context + "\n"
|
|
9047
9045
|
|
|
9048
|
-
# 3.3.13
|
|
9046
|
+
# 3.3.13 then add search_knowledge instructions to the system prompt
|
|
9047
|
+
if self.knowledge is not None and self.search_knowledge and self.add_search_knowledge_instructions:
|
|
9048
|
+
build_context_fn = getattr(self.knowledge, "build_context", None)
|
|
9049
|
+
if callable(build_context_fn):
|
|
9050
|
+
knowledge_context = build_context_fn(
|
|
9051
|
+
enable_agentic_filters=self.enable_agentic_knowledge_filters,
|
|
9052
|
+
)
|
|
9053
|
+
if knowledge_context is not None:
|
|
9054
|
+
system_message_content += knowledge_context + "\n"
|
|
9055
|
+
|
|
9056
|
+
# 3.3.14 Add the system message from the Model
|
|
9049
9057
|
system_message_from_model = self.model.get_system_message_for_model(tools)
|
|
9050
9058
|
if system_message_from_model is not None:
|
|
9051
9059
|
system_message_content += system_message_from_model
|
|
9052
9060
|
|
|
9053
|
-
# 3.3.
|
|
9061
|
+
# 3.3.15 Add the JSON output prompt if output_schema is provided and the model does not support native structured outputs or JSON schema outputs
|
|
9054
9062
|
# or if use_json_mode is True
|
|
9055
9063
|
if (
|
|
9056
9064
|
output_schema is not None
|
|
@@ -9062,11 +9070,11 @@ class Agent:
|
|
|
9062
9070
|
):
|
|
9063
9071
|
system_message_content += f"{get_json_output_prompt(output_schema)}" # type: ignore
|
|
9064
9072
|
|
|
9065
|
-
# 3.3.
|
|
9073
|
+
# 3.3.16 Add the response model format prompt if output_schema is provided (Pydantic only)
|
|
9066
9074
|
if output_schema is not None and self.parser_model is not None and not isinstance(output_schema, dict):
|
|
9067
9075
|
system_message_content += f"{get_response_model_format_prompt(output_schema)}"
|
|
9068
9076
|
|
|
9069
|
-
# 3.3.
|
|
9077
|
+
# 3.3.17 Add the session state to the system message
|
|
9070
9078
|
if add_session_state_to_context and session_state is not None:
|
|
9071
9079
|
system_message_content += f"\n<session_state>\n{session_state}\n</session_state>\n\n"
|
|
9072
9080
|
|
|
@@ -9196,24 +9204,6 @@ class Agent:
|
|
|
9196
9204
|
if self.name is not None and self.add_name_to_context:
|
|
9197
9205
|
additional_information.append(f"Your name is: {self.name}.")
|
|
9198
9206
|
|
|
9199
|
-
# 3.2.5 Add knowledge context using protocol's build_context (async)
|
|
9200
|
-
if self.knowledge is not None:
|
|
9201
|
-
# Prefer async version if available for async databases
|
|
9202
|
-
abuild_context_fn = getattr(self.knowledge, "abuild_context", None)
|
|
9203
|
-
build_context_fn = getattr(self.knowledge, "build_context", None)
|
|
9204
|
-
if callable(abuild_context_fn):
|
|
9205
|
-
knowledge_context = await abuild_context_fn(
|
|
9206
|
-
enable_agentic_filters=self.enable_agentic_knowledge_filters,
|
|
9207
|
-
)
|
|
9208
|
-
if knowledge_context:
|
|
9209
|
-
additional_information.append(knowledge_context)
|
|
9210
|
-
elif callable(build_context_fn):
|
|
9211
|
-
knowledge_context = build_context_fn(
|
|
9212
|
-
enable_agentic_filters=self.enable_agentic_knowledge_filters,
|
|
9213
|
-
)
|
|
9214
|
-
if knowledge_context:
|
|
9215
|
-
additional_information.append(knowledge_context)
|
|
9216
|
-
|
|
9217
9207
|
# 3.3 Build the default system message for the Agent.
|
|
9218
9208
|
system_message_content: str = ""
|
|
9219
9209
|
# 3.3.1 First add the Agent description if provided
|
|
@@ -9393,12 +9383,30 @@ class Agent:
|
|
|
9393
9383
|
if learning_context:
|
|
9394
9384
|
system_message_content += learning_context + "\n"
|
|
9395
9385
|
|
|
9396
|
-
# 3.3.13
|
|
9386
|
+
# 3.3.13 then add search_knowledge instructions to the system prompt
|
|
9387
|
+
if self.knowledge is not None and self.search_knowledge and self.add_search_knowledge_instructions:
|
|
9388
|
+
# Prefer async version if available for async databases
|
|
9389
|
+
abuild_context_fn = getattr(self.knowledge, "abuild_context", None)
|
|
9390
|
+
build_context_fn = getattr(self.knowledge, "build_context", None)
|
|
9391
|
+
if callable(abuild_context_fn):
|
|
9392
|
+
knowledge_context = await abuild_context_fn(
|
|
9393
|
+
enable_agentic_filters=self.enable_agentic_knowledge_filters,
|
|
9394
|
+
)
|
|
9395
|
+
if knowledge_context is not None:
|
|
9396
|
+
system_message_content += knowledge_context + "\n"
|
|
9397
|
+
elif callable(build_context_fn):
|
|
9398
|
+
knowledge_context = build_context_fn(
|
|
9399
|
+
enable_agentic_filters=self.enable_agentic_knowledge_filters,
|
|
9400
|
+
)
|
|
9401
|
+
if knowledge_context is not None:
|
|
9402
|
+
system_message_content += knowledge_context + "\n"
|
|
9403
|
+
|
|
9404
|
+
# 3.3.14 Add the system message from the Model
|
|
9397
9405
|
system_message_from_model = self.model.get_system_message_for_model(tools)
|
|
9398
9406
|
if system_message_from_model is not None:
|
|
9399
9407
|
system_message_content += system_message_from_model
|
|
9400
9408
|
|
|
9401
|
-
# 3.3.
|
|
9409
|
+
# 3.3.15 Add the JSON output prompt if output_schema is provided and the model does not support native structured outputs or JSON schema outputs
|
|
9402
9410
|
# or if use_json_mode is True
|
|
9403
9411
|
if (
|
|
9404
9412
|
output_schema is not None
|
|
@@ -9410,11 +9418,11 @@ class Agent:
|
|
|
9410
9418
|
):
|
|
9411
9419
|
system_message_content += f"{get_json_output_prompt(output_schema)}" # type: ignore
|
|
9412
9420
|
|
|
9413
|
-
# 3.3.
|
|
9421
|
+
# 3.3.16 Add the response model format prompt if output_schema is provided (Pydantic only)
|
|
9414
9422
|
if output_schema is not None and self.parser_model is not None and not isinstance(output_schema, dict):
|
|
9415
9423
|
system_message_content += f"{get_response_model_format_prompt(output_schema)}"
|
|
9416
9424
|
|
|
9417
|
-
# 3.3.
|
|
9425
|
+
# 3.3.17 Add the session state to the system message
|
|
9418
9426
|
if add_session_state_to_context and session_state is not None:
|
|
9419
9427
|
system_message_content += self._get_formatted_session_state_for_system_message(session_state)
|
|
9420
9428
|
|
agno/db/surrealdb/models.py
CHANGED
|
@@ -48,7 +48,7 @@ def surrealize_dates(record: dict) -> dict:
|
|
|
48
48
|
if isinstance(value, date):
|
|
49
49
|
copy[key] = datetime.combine(value, datetime.min.time()).replace(tzinfo=timezone.utc)
|
|
50
50
|
elif key in ["created_at", "updated_at"] and isinstance(value, (int, float)):
|
|
51
|
-
copy[key] = datetime.fromtimestamp(value
|
|
51
|
+
copy[key] = datetime.fromtimestamp(value, tz=timezone.utc)
|
|
52
52
|
elif key in ["created_at", "updated_at"] and isinstance(value, str):
|
|
53
53
|
# Handle ISO string format - convert back to datetime object for SurrealDB
|
|
54
54
|
try:
|
agno/knowledge/chunking/agentic.py
CHANGED
|
@@ -55,11 +55,7 @@ class AgenticChunking(ChunkingStrategy):
|
|
|
55
55
|
chunk = remaining_text[:break_point].strip()
|
|
56
56
|
meta_data = chunk_meta_data.copy()
|
|
57
57
|
meta_data["chunk"] = chunk_number
|
|
58
|
-
chunk_id =
|
|
59
|
-
if document.id:
|
|
60
|
-
chunk_id = f"{document.id}_{chunk_number}"
|
|
61
|
-
elif document.name:
|
|
62
|
-
chunk_id = f"{document.name}_{chunk_number}"
|
|
58
|
+
chunk_id = self._generate_chunk_id(document, chunk_number, chunk)
|
|
63
59
|
meta_data["chunk_size"] = len(chunk)
|
|
64
60
|
chunks.append(
|
|
65
61
|
Document(
|
agno/knowledge/chunking/code.py
CHANGED
|
@@ -82,7 +82,7 @@ class CodeChunking(ChunkingStrategy):
|
|
|
82
82
|
for i, chunk in enumerate(chunks, 1):
|
|
83
83
|
meta_data = document.meta_data.copy()
|
|
84
84
|
meta_data["chunk"] = i
|
|
85
|
-
chunk_id =
|
|
85
|
+
chunk_id = self._generate_chunk_id(document, i, chunk.text)
|
|
86
86
|
meta_data["chunk_size"] = len(chunk.text)
|
|
87
87
|
|
|
88
88
|
chunked_documents.append(Document(id=chunk_id, name=document.name, meta_data=meta_data, content=chunk.text))
|
agno/knowledge/chunking/document.py
CHANGED
|
@@ -38,17 +38,10 @@ class DocumentChunking(ChunkingStrategy):
|
|
|
38
38
|
if current_chunk:
|
|
39
39
|
meta_data = chunk_meta_data.copy()
|
|
40
40
|
meta_data["chunk"] = chunk_number
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
chunk_id = f"{document.name}_{chunk_number}"
|
|
46
|
-
meta_data["chunk_size"] = len("\n\n".join(current_chunk))
|
|
47
|
-
chunks.append(
|
|
48
|
-
Document(
|
|
49
|
-
id=chunk_id, name=document.name, meta_data=meta_data, content="\n\n".join(current_chunk)
|
|
50
|
-
)
|
|
51
|
-
)
|
|
41
|
+
chunk_content = "\n\n".join(current_chunk)
|
|
42
|
+
chunk_id = self._generate_chunk_id(document, chunk_number, chunk_content)
|
|
43
|
+
meta_data["chunk_size"] = len(chunk_content)
|
|
44
|
+
chunks.append(Document(id=chunk_id, name=document.name, meta_data=meta_data, content=chunk_content))
|
|
52
45
|
chunk_number += 1
|
|
53
46
|
current_chunk = []
|
|
54
47
|
current_size = 0
|
|
@@ -70,18 +63,15 @@ class DocumentChunking(ChunkingStrategy):
|
|
|
70
63
|
if current_chunk:
|
|
71
64
|
meta_data = chunk_meta_data.copy()
|
|
72
65
|
meta_data["chunk"] = chunk_number
|
|
73
|
-
|
|
74
|
-
|
|
75
|
-
|
|
76
|
-
elif document.name:
|
|
77
|
-
chunk_id = f"{document.name}_{chunk_number}"
|
|
78
|
-
meta_data["chunk_size"] = len(" ".join(current_chunk))
|
|
66
|
+
chunk_content = " ".join(current_chunk)
|
|
67
|
+
chunk_id = self._generate_chunk_id(document, chunk_number, chunk_content)
|
|
68
|
+
meta_data["chunk_size"] = len(chunk_content)
|
|
79
69
|
chunks.append(
|
|
80
70
|
Document(
|
|
81
71
|
id=chunk_id,
|
|
82
72
|
name=document.name,
|
|
83
73
|
meta_data=meta_data,
|
|
84
|
-
content=
|
|
74
|
+
content=chunk_content,
|
|
85
75
|
)
|
|
86
76
|
)
|
|
87
77
|
chunk_number += 1
|
|
@@ -94,18 +84,11 @@ class DocumentChunking(ChunkingStrategy):
|
|
|
94
84
|
else:
|
|
95
85
|
meta_data = chunk_meta_data.copy()
|
|
96
86
|
meta_data["chunk"] = chunk_number
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
|
|
100
|
-
elif document.name:
|
|
101
|
-
chunk_id = f"{document.name}_{chunk_number}"
|
|
102
|
-
meta_data["chunk_size"] = len("\n\n".join(current_chunk))
|
|
87
|
+
chunk_content = "\n\n".join(current_chunk)
|
|
88
|
+
chunk_id = self._generate_chunk_id(document, chunk_number, chunk_content)
|
|
89
|
+
meta_data["chunk_size"] = len(chunk_content)
|
|
103
90
|
if current_chunk:
|
|
104
|
-
chunks.append(
|
|
105
|
-
Document(
|
|
106
|
-
id=chunk_id, name=document.name, meta_data=meta_data, content="\n\n".join(current_chunk)
|
|
107
|
-
)
|
|
108
|
-
)
|
|
91
|
+
chunks.append(Document(id=chunk_id, name=document.name, meta_data=meta_data, content=chunk_content))
|
|
109
92
|
chunk_number += 1
|
|
110
93
|
current_chunk = [para]
|
|
111
94
|
current_size = para_size
|
|
@@ -113,15 +96,10 @@ class DocumentChunking(ChunkingStrategy):
|
|
|
113
96
|
if current_chunk:
|
|
114
97
|
meta_data = chunk_meta_data.copy()
|
|
115
98
|
meta_data["chunk"] = chunk_number
|
|
116
|
-
|
|
117
|
-
|
|
118
|
-
|
|
119
|
-
|
|
120
|
-
chunk_id = f"{document.name}_{chunk_number}"
|
|
121
|
-
meta_data["chunk_size"] = len("\n\n".join(current_chunk))
|
|
122
|
-
chunks.append(
|
|
123
|
-
Document(id=chunk_id, name=document.name, meta_data=meta_data, content="\n\n".join(current_chunk))
|
|
124
|
-
)
|
|
99
|
+
chunk_content = "\n\n".join(current_chunk)
|
|
100
|
+
chunk_id = self._generate_chunk_id(document, chunk_number, chunk_content)
|
|
101
|
+
meta_data["chunk_size"] = len(chunk_content)
|
|
102
|
+
chunks.append(Document(id=chunk_id, name=document.name, meta_data=meta_data, content=chunk_content))
|
|
125
103
|
|
|
126
104
|
# Handle overlap if specified
|
|
127
105
|
if self.overlap > 0:
|
|
@@ -131,11 +109,11 @@ class DocumentChunking(ChunkingStrategy):
|
|
|
131
109
|
# Add overlap from previous chunk
|
|
132
110
|
prev_text = chunks[i - 1].content[-self.overlap :]
|
|
133
111
|
meta_data = chunk_meta_data.copy()
|
|
134
|
-
|
|
135
|
-
|
|
136
|
-
|
|
137
|
-
chunk_id = f"{document.id}_{chunk_number}"
|
|
112
|
+
# Use the chunk's existing metadata and ID instead of stale chunk_number
|
|
113
|
+
meta_data["chunk"] = chunks[i].meta_data["chunk"]
|
|
114
|
+
chunk_id = chunks[i].id
|
|
138
115
|
meta_data["chunk_size"] = len(prev_text + chunks[i].content)
|
|
116
|
+
|
|
139
117
|
if prev_text:
|
|
140
118
|
overlapped_chunks.append(
|
|
141
119
|
Document(
|
|
@@ -145,6 +123,8 @@ class DocumentChunking(ChunkingStrategy):
|
|
|
145
123
|
content=prev_text + chunks[i].content,
|
|
146
124
|
)
|
|
147
125
|
)
|
|
126
|
+
else:
|
|
127
|
+
overlapped_chunks.append(chunks[i])
|
|
148
128
|
else:
|
|
149
129
|
overlapped_chunks.append(chunks[i])
|
|
150
130
|
chunks = overlapped_chunks
|
agno/knowledge/chunking/fixed.py
CHANGED
|
@@ -38,11 +38,7 @@ class FixedSizeChunking(ChunkingStrategy):
|
|
|
38
38
|
chunk = content[start:end]
|
|
39
39
|
meta_data = chunk_meta_data.copy()
|
|
40
40
|
meta_data["chunk"] = chunk_number
|
|
41
|
-
chunk_id =
|
|
42
|
-
if document.id:
|
|
43
|
-
chunk_id = f"{document.id}_{chunk_number}"
|
|
44
|
-
elif document.name:
|
|
45
|
-
chunk_id = f"{document.name}_{chunk_number}"
|
|
41
|
+
chunk_id = self._generate_chunk_id(document, chunk_number, chunk)
|
|
46
42
|
meta_data["chunk_size"] = len(chunk)
|
|
47
43
|
chunked_documents.append(
|
|
48
44
|
Document(
|
agno/knowledge/chunking/markdown.py
CHANGED
|
@@ -267,11 +267,7 @@ class MarkdownChunking(ChunkingStrategy):
|
|
|
267
267
|
for sub_chunk in sub_chunks:
|
|
268
268
|
meta_data = chunk_meta_data.copy()
|
|
269
269
|
meta_data["chunk"] = chunk_number
|
|
270
|
-
chunk_id =
|
|
271
|
-
if document.id:
|
|
272
|
-
chunk_id = f"{document.id}_{chunk_number}"
|
|
273
|
-
elif document.name:
|
|
274
|
-
chunk_id = f"{document.name}_{chunk_number}"
|
|
270
|
+
chunk_id = self._generate_chunk_id(document, chunk_number, sub_chunk)
|
|
275
271
|
meta_data["chunk_size"] = len(sub_chunk)
|
|
276
272
|
|
|
277
273
|
chunks.append(Document(id=chunk_id, name=document.name, meta_data=meta_data, content=sub_chunk))
|
|
@@ -282,19 +278,12 @@ class MarkdownChunking(ChunkingStrategy):
|
|
|
282
278
|
else:
|
|
283
279
|
meta_data = chunk_meta_data.copy()
|
|
284
280
|
meta_data["chunk"] = chunk_number
|
|
285
|
-
|
|
286
|
-
|
|
287
|
-
|
|
288
|
-
elif document.name:
|
|
289
|
-
chunk_id = f"{document.name}_{chunk_number}"
|
|
290
|
-
meta_data["chunk_size"] = len("\n\n".join(current_chunk))
|
|
281
|
+
chunk_content = "\n\n".join(current_chunk)
|
|
282
|
+
chunk_id = self._generate_chunk_id(document, chunk_number, chunk_content)
|
|
283
|
+
meta_data["chunk_size"] = len(chunk_content)
|
|
291
284
|
|
|
292
285
|
if current_chunk:
|
|
293
|
-
chunks.append(
|
|
294
|
-
Document(
|
|
295
|
-
id=chunk_id, name=document.name, meta_data=meta_data, content="\n\n".join(current_chunk)
|
|
296
|
-
)
|
|
297
|
-
)
|
|
286
|
+
chunks.append(Document(id=chunk_id, name=document.name, meta_data=meta_data, content=chunk_content))
|
|
298
287
|
chunk_number += 1
|
|
299
288
|
|
|
300
289
|
current_chunk = [section]
|
|
@@ -304,15 +293,10 @@ class MarkdownChunking(ChunkingStrategy):
|
|
|
304
293
|
if current_chunk and not self.split_on_headings:
|
|
305
294
|
meta_data = chunk_meta_data.copy()
|
|
306
295
|
meta_data["chunk"] = chunk_number
|
|
307
|
-
|
|
308
|
-
|
|
309
|
-
|
|
310
|
-
|
|
311
|
-
chunk_id = f"{document.name}_{chunk_number}"
|
|
312
|
-
meta_data["chunk_size"] = len("\n\n".join(current_chunk))
|
|
313
|
-
chunks.append(
|
|
314
|
-
Document(id=chunk_id, name=document.name, meta_data=meta_data, content="\n\n".join(current_chunk))
|
|
315
|
-
)
|
|
296
|
+
chunk_content = "\n\n".join(current_chunk)
|
|
297
|
+
chunk_id = self._generate_chunk_id(document, chunk_number, chunk_content)
|
|
298
|
+
meta_data["chunk_size"] = len(chunk_content)
|
|
299
|
+
chunks.append(Document(id=chunk_id, name=document.name, meta_data=meta_data, content=chunk_content))
|
|
316
300
|
|
|
317
301
|
# Handle overlap if specified
|
|
318
302
|
if self.overlap > 0:
|
agno/knowledge/chunking/recursive.py
CHANGED
|
@@ -46,9 +46,7 @@ class RecursiveChunking(ChunkingStrategy):
|
|
|
46
46
|
chunk = self.clean_text(content[start:end])
|
|
47
47
|
meta_data = chunk_meta_data.copy()
|
|
48
48
|
meta_data["chunk"] = chunk_number
|
|
49
|
-
chunk_id =
|
|
50
|
-
if document.id:
|
|
51
|
-
chunk_id = f"{document.id}_{chunk_number}"
|
|
49
|
+
chunk_id = self._generate_chunk_id(document, chunk_number, chunk)
|
|
52
50
|
chunk_number += 1
|
|
53
51
|
meta_data["chunk_size"] = len(chunk)
|
|
54
52
|
chunks.append(Document(id=chunk_id, name=document.name, meta_data=meta_data, content=chunk))
|
agno/knowledge/chunking/row.py
CHANGED
|
@@ -33,7 +33,8 @@ class RowChunking(ChunkingStrategy):
|
|
|
33
33
|
|
|
34
34
|
if chunk_content: # Skip empty rows
|
|
35
35
|
meta_data = document.meta_data.copy()
|
|
36
|
-
|
|
37
|
-
|
|
36
|
+
row_number = start_index + i
|
|
37
|
+
meta_data["row_number"] = row_number # Preserve logical row numbering
|
|
38
|
+
chunk_id = self._generate_chunk_id(document, row_number, chunk_content, prefix="row")
|
|
38
39
|
chunks.append(Document(id=chunk_id, name=document.name, meta_data=meta_data, content=chunk_content))
|
|
39
40
|
return chunks
|
agno/knowledge/chunking/semantic.py
CHANGED
|
@@ -160,7 +160,7 @@ class SemanticChunking(ChunkingStrategy):
|
|
|
160
160
|
for i, chunk in enumerate(chunks, 1):
|
|
161
161
|
meta_data = document.meta_data.copy()
|
|
162
162
|
meta_data["chunk"] = i
|
|
163
|
-
chunk_id =
|
|
163
|
+
chunk_id = self._generate_chunk_id(document, i, chunk.text)
|
|
164
164
|
meta_data["chunk_size"] = len(chunk.text)
|
|
165
165
|
|
|
166
166
|
chunked_documents.append(Document(id=chunk_id, name=document.name, meta_data=meta_data, content=chunk.text))
|
agno/knowledge/chunking/strategy.py
CHANGED
|
@@ -1,3 +1,4 @@
|
|
|
1
|
+
import hashlib
|
|
1
2
|
from abc import ABC, abstractmethod
|
|
2
3
|
from enum import Enum
|
|
3
4
|
from typing import List, Optional
|
|
@@ -12,6 +13,24 @@ class ChunkingStrategy(ABC):
|
|
|
12
13
|
def chunk(self, document: Document) -> List[Document]:
|
|
13
14
|
raise NotImplementedError
|
|
14
15
|
|
|
16
|
+
def _generate_chunk_id(
|
|
17
|
+
self, document: Document, chunk_number: int, content: Optional[str] = None, prefix: Optional[str] = None
|
|
18
|
+
) -> Optional[str]:
|
|
19
|
+
"""Generate a deterministic ID for the chunk."""
|
|
20
|
+
suffix = f"_{prefix}_{chunk_number}" if prefix else f"_{chunk_number}"
|
|
21
|
+
|
|
22
|
+
if document.id:
|
|
23
|
+
return f"{document.id}{suffix}"
|
|
24
|
+
elif document.name:
|
|
25
|
+
return f"{document.name}{suffix}"
|
|
26
|
+
else:
|
|
27
|
+
# Hash the chunk content for a deterministic ID when no identifier exists
|
|
28
|
+
hash_source = content if content else document.content
|
|
29
|
+
if hash_source:
|
|
30
|
+
content_hash = hashlib.md5(hash_source.encode("utf-8")).hexdigest()[:12] # nosec B324
|
|
31
|
+
return f"chunk_{content_hash}{suffix}"
|
|
32
|
+
return None
|
|
33
|
+
|
|
15
34
|
async def achunk(self, document: Document) -> List[Document]:
|
|
16
35
|
"""Async version of chunk. Override for truly async implementations."""
|
|
17
36
|
return self.chunk(document)
|