MemoryOS 0.2.2__py3-none-any.whl → 1.0.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of MemoryOS might be problematic. Click here for more details.
- {memoryos-0.2.2.dist-info → memoryos-1.0.1.dist-info}/METADATA +7 -1
- {memoryos-0.2.2.dist-info → memoryos-1.0.1.dist-info}/RECORD +81 -66
- memos/__init__.py +1 -1
- memos/api/config.py +31 -8
- memos/api/context/context.py +1 -1
- memos/api/context/context_thread.py +96 -0
- memos/api/middleware/request_context.py +94 -0
- memos/api/product_api.py +5 -1
- memos/api/product_models.py +16 -0
- memos/api/routers/product_router.py +39 -3
- memos/api/start_api.py +3 -0
- memos/configs/internet_retriever.py +13 -0
- memos/configs/mem_scheduler.py +38 -16
- memos/configs/memory.py +13 -0
- memos/configs/reranker.py +18 -0
- memos/graph_dbs/base.py +33 -4
- memos/graph_dbs/nebular.py +631 -236
- memos/graph_dbs/neo4j.py +18 -7
- memos/graph_dbs/neo4j_community.py +6 -3
- memos/llms/vllm.py +2 -0
- memos/log.py +125 -8
- memos/mem_os/core.py +49 -11
- memos/mem_os/main.py +1 -1
- memos/mem_os/product.py +392 -215
- memos/mem_os/utils/default_config.py +1 -1
- memos/mem_os/utils/format_utils.py +11 -47
- memos/mem_os/utils/reference_utils.py +153 -0
- memos/mem_reader/simple_struct.py +112 -43
- memos/mem_scheduler/base_scheduler.py +58 -55
- memos/mem_scheduler/{modules → general_modules}/base.py +1 -2
- memos/mem_scheduler/{modules → general_modules}/dispatcher.py +54 -15
- memos/mem_scheduler/{modules → general_modules}/rabbitmq_service.py +4 -4
- memos/mem_scheduler/{modules → general_modules}/redis_service.py +1 -1
- memos/mem_scheduler/{modules → general_modules}/retriever.py +19 -5
- memos/mem_scheduler/{modules → general_modules}/scheduler_logger.py +10 -4
- memos/mem_scheduler/general_scheduler.py +110 -67
- memos/mem_scheduler/monitors/__init__.py +0 -0
- memos/mem_scheduler/monitors/dispatcher_monitor.py +305 -0
- memos/mem_scheduler/{modules/monitor.py → monitors/general_monitor.py} +57 -19
- memos/mem_scheduler/mos_for_test_scheduler.py +7 -1
- memos/mem_scheduler/schemas/general_schemas.py +3 -2
- memos/mem_scheduler/schemas/message_schemas.py +2 -1
- memos/mem_scheduler/schemas/monitor_schemas.py +10 -2
- memos/mem_scheduler/utils/misc_utils.py +43 -2
- memos/mem_user/mysql_user_manager.py +4 -2
- memos/memories/activation/item.py +1 -1
- memos/memories/activation/kv.py +20 -8
- memos/memories/textual/base.py +1 -1
- memos/memories/textual/general.py +1 -1
- memos/memories/textual/item.py +1 -1
- memos/memories/textual/tree.py +31 -1
- memos/memories/textual/tree_text_memory/organize/{conflict.py → handler.py} +30 -48
- memos/memories/textual/tree_text_memory/organize/manager.py +8 -96
- memos/memories/textual/tree_text_memory/organize/relation_reason_detector.py +2 -0
- memos/memories/textual/tree_text_memory/organize/reorganizer.py +102 -140
- memos/memories/textual/tree_text_memory/retrieve/bochasearch.py +231 -0
- memos/memories/textual/tree_text_memory/retrieve/internet_retriever_factory.py +9 -0
- memos/memories/textual/tree_text_memory/retrieve/recall.py +67 -10
- memos/memories/textual/tree_text_memory/retrieve/reranker.py +1 -1
- memos/memories/textual/tree_text_memory/retrieve/searcher.py +246 -134
- memos/memories/textual/tree_text_memory/retrieve/task_goal_parser.py +7 -2
- memos/memories/textual/tree_text_memory/retrieve/utils.py +7 -5
- memos/memos_tools/lockfree_dict.py +120 -0
- memos/memos_tools/notification_utils.py +46 -0
- memos/memos_tools/thread_safe_dict.py +288 -0
- memos/reranker/__init__.py +4 -0
- memos/reranker/base.py +24 -0
- memos/reranker/cosine_local.py +95 -0
- memos/reranker/factory.py +43 -0
- memos/reranker/http_bge.py +99 -0
- memos/reranker/noop.py +16 -0
- memos/templates/mem_reader_prompts.py +290 -39
- memos/templates/mem_scheduler_prompts.py +23 -10
- memos/templates/mos_prompts.py +133 -31
- memos/templates/tree_reorganize_prompts.py +24 -17
- memos/utils.py +19 -0
- memos/memories/textual/tree_text_memory/organize/redundancy.py +0 -193
- {memoryos-0.2.2.dist-info → memoryos-1.0.1.dist-info}/LICENSE +0 -0
- {memoryos-0.2.2.dist-info → memoryos-1.0.1.dist-info}/WHEEL +0 -0
- {memoryos-0.2.2.dist-info → memoryos-1.0.1.dist-info}/entry_points.txt +0 -0
- /memos/mem_scheduler/{modules → general_modules}/__init__.py +0 -0
- /memos/mem_scheduler/{modules → general_modules}/misc.py +0 -0
|
@@ -112,7 +112,7 @@ def get_default_config(
|
|
|
112
112
|
"thread_pool_max_workers": kwargs.get("scheduler_thread_pool_max_workers", 10),
|
|
113
113
|
"consume_interval_seconds": kwargs.get("scheduler_consume_interval_seconds", 3),
|
|
114
114
|
"enable_parallel_dispatch": kwargs.get("scheduler_enable_parallel_dispatch", True),
|
|
115
|
-
"
|
|
115
|
+
"enable_activation_memory": True,
|
|
116
116
|
},
|
|
117
117
|
}
|
|
118
118
|
|
|
@@ -570,15 +570,23 @@ def convert_graph_to_tree_forworkmem(
|
|
|
570
570
|
else:
|
|
571
571
|
other_roots.append(root_id)
|
|
572
572
|
|
|
573
|
-
def build_tree(node_id: str) -> dict[str, Any]:
|
|
574
|
-
"""Recursively build tree structure"""
|
|
573
|
+
def build_tree(node_id: str, visited=None) -> dict[str, Any] | None:
|
|
574
|
+
"""Recursively build tree structure with cycle detection"""
|
|
575
|
+
if visited is None:
|
|
576
|
+
visited = set()
|
|
577
|
+
|
|
578
|
+
if node_id in visited:
|
|
579
|
+
logger.warning(f"[build_tree] Detected cycle at node {node_id}, skipping.")
|
|
580
|
+
return None
|
|
581
|
+
visited.add(node_id)
|
|
582
|
+
|
|
575
583
|
if node_id not in node_map:
|
|
576
584
|
return None
|
|
577
585
|
|
|
578
586
|
children_ids = children_map.get(node_id, [])
|
|
579
587
|
children = []
|
|
580
588
|
for child_id in children_ids:
|
|
581
|
-
child_tree = build_tree(child_id)
|
|
589
|
+
child_tree = build_tree(child_id, visited)
|
|
582
590
|
if child_tree:
|
|
583
591
|
children.append(child_tree)
|
|
584
592
|
|
|
@@ -1355,47 +1363,3 @@ def clean_json_response(response: str) -> str:
|
|
|
1355
1363
|
str: Clean JSON string without markdown formatting
|
|
1356
1364
|
"""
|
|
1357
1365
|
return response.replace("```json", "").replace("```", "").strip()
|
|
1358
|
-
|
|
1359
|
-
|
|
1360
|
-
def split_continuous_references(text: str) -> str:
|
|
1361
|
-
"""
|
|
1362
|
-
Split continuous reference tags into individual reference tags.
|
|
1363
|
-
|
|
1364
|
-
Converts patterns like [1:92ff35fb, 4:bfe6f044] to [1:92ff35fb] [4:bfe6f044]
|
|
1365
|
-
|
|
1366
|
-
Only processes text if:
|
|
1367
|
-
1. '[' appears exactly once
|
|
1368
|
-
2. ']' appears exactly once
|
|
1369
|
-
3. Contains commas between '[' and ']'
|
|
1370
|
-
|
|
1371
|
-
Args:
|
|
1372
|
-
text (str): Text containing reference tags
|
|
1373
|
-
|
|
1374
|
-
Returns:
|
|
1375
|
-
str: Text with split reference tags, or original text if conditions not met
|
|
1376
|
-
"""
|
|
1377
|
-
# Early return if text is empty
|
|
1378
|
-
if not text:
|
|
1379
|
-
return text
|
|
1380
|
-
# Check if '[' appears exactly once
|
|
1381
|
-
if text.count("[") != 1:
|
|
1382
|
-
return text
|
|
1383
|
-
# Check if ']' appears exactly once
|
|
1384
|
-
if text.count("]") != 1:
|
|
1385
|
-
return text
|
|
1386
|
-
# Find positions of brackets
|
|
1387
|
-
open_bracket_pos = text.find("[")
|
|
1388
|
-
close_bracket_pos = text.find("]")
|
|
1389
|
-
|
|
1390
|
-
# Check if brackets are in correct order
|
|
1391
|
-
if open_bracket_pos >= close_bracket_pos:
|
|
1392
|
-
return text
|
|
1393
|
-
# Extract content between brackets
|
|
1394
|
-
content_between_brackets = text[open_bracket_pos + 1 : close_bracket_pos]
|
|
1395
|
-
# Check if there's a comma between brackets
|
|
1396
|
-
if "," not in content_between_brackets:
|
|
1397
|
-
return text
|
|
1398
|
-
text = text.replace(content_between_brackets, content_between_brackets.replace(", ", "]["))
|
|
1399
|
-
text = text.replace(content_between_brackets, content_between_brackets.replace(",", "]["))
|
|
1400
|
-
|
|
1401
|
-
return text
|
|
@@ -0,0 +1,153 @@
|
|
|
1
|
+
from memos.memories.textual.item import (
|
|
2
|
+
TextualMemoryItem,
|
|
3
|
+
)
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
def split_continuous_references(text: str) -> str:
|
|
7
|
+
"""
|
|
8
|
+
Split continuous reference tags into individual reference tags.
|
|
9
|
+
|
|
10
|
+
Converts patterns like [1:92ff35fb, 4:bfe6f044] to [1:92ff35fb] [4:bfe6f044]
|
|
11
|
+
|
|
12
|
+
Only processes text if:
|
|
13
|
+
1. '[' appears exactly once
|
|
14
|
+
2. ']' appears exactly once
|
|
15
|
+
3. Contains commas between '[' and ']'
|
|
16
|
+
|
|
17
|
+
Args:
|
|
18
|
+
text (str): Text containing reference tags
|
|
19
|
+
|
|
20
|
+
Returns:
|
|
21
|
+
str: Text with split reference tags, or original text if conditions not met
|
|
22
|
+
"""
|
|
23
|
+
# Early return if text is empty
|
|
24
|
+
if not text:
|
|
25
|
+
return text
|
|
26
|
+
# Check if '[' appears exactly once
|
|
27
|
+
if text.count("[") != 1:
|
|
28
|
+
return text
|
|
29
|
+
# Check if ']' appears exactly once
|
|
30
|
+
if text.count("]") != 1:
|
|
31
|
+
return text
|
|
32
|
+
# Find positions of brackets
|
|
33
|
+
open_bracket_pos = text.find("[")
|
|
34
|
+
close_bracket_pos = text.find("]")
|
|
35
|
+
|
|
36
|
+
# Check if brackets are in correct order
|
|
37
|
+
if open_bracket_pos >= close_bracket_pos:
|
|
38
|
+
return text
|
|
39
|
+
# Extract content between brackets
|
|
40
|
+
content_between_brackets = text[open_bracket_pos + 1 : close_bracket_pos]
|
|
41
|
+
# Check if there's a comma between brackets
|
|
42
|
+
if "," not in content_between_brackets:
|
|
43
|
+
return text
|
|
44
|
+
text = text.replace(content_between_brackets, content_between_brackets.replace(", ", "]["))
|
|
45
|
+
text = text.replace(content_between_brackets, content_between_brackets.replace(",", "]["))
|
|
46
|
+
|
|
47
|
+
return text
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
def process_streaming_references_complete(text_buffer: str) -> tuple[str, str]:
|
|
51
|
+
"""
|
|
52
|
+
Complete streaming reference processing to ensure reference tags are never split.
|
|
53
|
+
|
|
54
|
+
Args:
|
|
55
|
+
text_buffer (str): The accumulated text buffer.
|
|
56
|
+
|
|
57
|
+
Returns:
|
|
58
|
+
tuple[str, str]: (processed_text, remaining_buffer)
|
|
59
|
+
"""
|
|
60
|
+
import re
|
|
61
|
+
|
|
62
|
+
# Pattern to match complete reference tags: [refid:memoriesID]
|
|
63
|
+
complete_pattern = r"\[\d+:[^\]]+\]"
|
|
64
|
+
|
|
65
|
+
# Find all complete reference tags
|
|
66
|
+
complete_matches = list(re.finditer(complete_pattern, text_buffer))
|
|
67
|
+
|
|
68
|
+
if complete_matches:
|
|
69
|
+
# Find the last complete tag
|
|
70
|
+
last_match = complete_matches[-1]
|
|
71
|
+
end_pos = last_match.end()
|
|
72
|
+
|
|
73
|
+
# Check if there's any incomplete reference after the last complete one
|
|
74
|
+
remaining_text = text_buffer[end_pos:]
|
|
75
|
+
|
|
76
|
+
# Look for potential incomplete reference patterns after the last complete tag
|
|
77
|
+
incomplete_pattern = r"\[\d*:?[^\]]*$"
|
|
78
|
+
if re.search(incomplete_pattern, remaining_text):
|
|
79
|
+
# There's a potential incomplete reference, find where it starts
|
|
80
|
+
incomplete_match = re.search(incomplete_pattern, remaining_text)
|
|
81
|
+
if incomplete_match:
|
|
82
|
+
incomplete_start = end_pos + incomplete_match.start()
|
|
83
|
+
processed_text = text_buffer[:incomplete_start]
|
|
84
|
+
remaining_buffer = text_buffer[incomplete_start:]
|
|
85
|
+
|
|
86
|
+
# Apply reference splitting to the processed text
|
|
87
|
+
processed_text = split_continuous_references(processed_text)
|
|
88
|
+
return processed_text, remaining_buffer
|
|
89
|
+
|
|
90
|
+
# No incomplete reference after the last complete tag, process all
|
|
91
|
+
processed_text = split_continuous_references(text_buffer)
|
|
92
|
+
return processed_text, ""
|
|
93
|
+
|
|
94
|
+
# Check for incomplete reference tags - be more specific about what constitutes a potential reference
|
|
95
|
+
# Look for opening bracket with number and colon that could be a reference tag
|
|
96
|
+
opening_pattern = r"\[\d+:"
|
|
97
|
+
opening_matches = list(re.finditer(opening_pattern, text_buffer))
|
|
98
|
+
|
|
99
|
+
if opening_matches:
|
|
100
|
+
# Find the last opening tag
|
|
101
|
+
last_opening = opening_matches[-1]
|
|
102
|
+
opening_start = last_opening.start()
|
|
103
|
+
|
|
104
|
+
# Check if this might be a complete reference tag (has closing bracket after the pattern)
|
|
105
|
+
remaining_text = text_buffer[last_opening.end() :]
|
|
106
|
+
if "]" in remaining_text:
|
|
107
|
+
# This looks like a complete reference tag, process it
|
|
108
|
+
processed_text = split_continuous_references(text_buffer)
|
|
109
|
+
return processed_text, ""
|
|
110
|
+
else:
|
|
111
|
+
# Incomplete reference tag, keep it in buffer
|
|
112
|
+
processed_text = text_buffer[:opening_start]
|
|
113
|
+
processed_text = split_continuous_references(processed_text)
|
|
114
|
+
return processed_text, text_buffer[opening_start:]
|
|
115
|
+
|
|
116
|
+
# More sophisticated check for potential reference patterns
|
|
117
|
+
# Only hold back text if we see a pattern that could be the start of a reference tag
|
|
118
|
+
potential_ref_pattern = r"\[\d*:?$" # Matches [, [1, [12:, etc. at end of buffer
|
|
119
|
+
if re.search(potential_ref_pattern, text_buffer):
|
|
120
|
+
# Find the position of the potential reference start
|
|
121
|
+
match = re.search(potential_ref_pattern, text_buffer)
|
|
122
|
+
if match:
|
|
123
|
+
ref_start = match.start()
|
|
124
|
+
processed_text = text_buffer[:ref_start]
|
|
125
|
+
processed_text = split_continuous_references(processed_text)
|
|
126
|
+
return processed_text, text_buffer[ref_start:]
|
|
127
|
+
|
|
128
|
+
# Check for standalone [ only at the very end of the buffer
|
|
129
|
+
# This prevents cutting off mathematical expressions like [ \Delta U = Q - W ]
|
|
130
|
+
if text_buffer.endswith("["):
|
|
131
|
+
# Only hold back the single [ character
|
|
132
|
+
processed_text = text_buffer[:-1]
|
|
133
|
+
processed_text = split_continuous_references(processed_text)
|
|
134
|
+
return processed_text, "["
|
|
135
|
+
|
|
136
|
+
# No reference-like patterns found, process all text
|
|
137
|
+
processed_text = split_continuous_references(text_buffer)
|
|
138
|
+
return processed_text, ""
|
|
139
|
+
|
|
140
|
+
|
|
141
|
+
def prepare_reference_data(memories_list: list[TextualMemoryItem]) -> list[dict]:
|
|
142
|
+
# Prepare reference data
|
|
143
|
+
reference = []
|
|
144
|
+
for memories in memories_list:
|
|
145
|
+
memories_json = memories.model_dump()
|
|
146
|
+
memories_json["metadata"]["ref_id"] = f"{memories.id.split('-')[0]}"
|
|
147
|
+
memories_json["metadata"]["embedding"] = []
|
|
148
|
+
memories_json["metadata"]["sources"] = []
|
|
149
|
+
memories_json["metadata"]["memory"] = memories.memory
|
|
150
|
+
memories_json["metadata"]["id"] = memories.id
|
|
151
|
+
reference.append({"metadata": memories_json["metadata"]})
|
|
152
|
+
|
|
153
|
+
return reference
|
|
@@ -1,10 +1,14 @@
|
|
|
1
1
|
import concurrent.futures
|
|
2
2
|
import copy
|
|
3
3
|
import json
|
|
4
|
+
import os
|
|
5
|
+
import re
|
|
4
6
|
|
|
5
7
|
from abc import ABC
|
|
6
8
|
from typing import Any
|
|
7
9
|
|
|
10
|
+
from tqdm import tqdm
|
|
11
|
+
|
|
8
12
|
from memos import log
|
|
9
13
|
from memos.chunkers import ChunkerFactory
|
|
10
14
|
from memos.configs.mem_reader import SimpleStructMemReaderConfig
|
|
@@ -16,12 +20,79 @@ from memos.memories.textual.item import TextualMemoryItem, TreeNodeTextualMemory
|
|
|
16
20
|
from memos.parsers.factory import ParserFactory
|
|
17
21
|
from memos.templates.mem_reader_prompts import (
|
|
18
22
|
SIMPLE_STRUCT_DOC_READER_PROMPT,
|
|
23
|
+
SIMPLE_STRUCT_DOC_READER_PROMPT_ZH,
|
|
19
24
|
SIMPLE_STRUCT_MEM_READER_EXAMPLE,
|
|
25
|
+
SIMPLE_STRUCT_MEM_READER_EXAMPLE_ZH,
|
|
20
26
|
SIMPLE_STRUCT_MEM_READER_PROMPT,
|
|
27
|
+
SIMPLE_STRUCT_MEM_READER_PROMPT_ZH,
|
|
21
28
|
)
|
|
22
29
|
|
|
23
30
|
|
|
24
31
|
logger = log.get_logger(__name__)
|
|
32
|
+
PROMPT_DICT = {
|
|
33
|
+
"chat": {
|
|
34
|
+
"en": SIMPLE_STRUCT_MEM_READER_PROMPT,
|
|
35
|
+
"zh": SIMPLE_STRUCT_MEM_READER_PROMPT_ZH,
|
|
36
|
+
"en_example": SIMPLE_STRUCT_MEM_READER_EXAMPLE,
|
|
37
|
+
"zh_example": SIMPLE_STRUCT_MEM_READER_EXAMPLE_ZH,
|
|
38
|
+
},
|
|
39
|
+
"doc": {"en": SIMPLE_STRUCT_DOC_READER_PROMPT, "zh": SIMPLE_STRUCT_DOC_READER_PROMPT_ZH},
|
|
40
|
+
}
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
def detect_lang(text):
|
|
44
|
+
try:
|
|
45
|
+
if not text or not isinstance(text, str):
|
|
46
|
+
return "en"
|
|
47
|
+
chinese_pattern = r"[\u4e00-\u9fff\u3400-\u4dbf\U00020000-\U0002a6df\U0002a700-\U0002b73f\U0002b740-\U0002b81f\U0002b820-\U0002ceaf\uf900-\ufaff]"
|
|
48
|
+
chinese_chars = re.findall(chinese_pattern, text)
|
|
49
|
+
if len(chinese_chars) / len(re.sub(r"[\s\d\W]", "", text)) > 0.3:
|
|
50
|
+
return "zh"
|
|
51
|
+
return "en"
|
|
52
|
+
except Exception:
|
|
53
|
+
return "en"
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
def _build_node(idx, message, info, scene_file, llm, parse_json_result, embedder):
|
|
57
|
+
# generate
|
|
58
|
+
raw = llm.generate(message)
|
|
59
|
+
if not raw:
|
|
60
|
+
return None
|
|
61
|
+
|
|
62
|
+
# parse_json_result
|
|
63
|
+
chunk_res = parse_json_result(raw)
|
|
64
|
+
if not chunk_res:
|
|
65
|
+
return None
|
|
66
|
+
|
|
67
|
+
value = chunk_res.get("value")
|
|
68
|
+
if not value:
|
|
69
|
+
return None
|
|
70
|
+
|
|
71
|
+
# embed
|
|
72
|
+
embedding = embedder.embed([value])[0]
|
|
73
|
+
|
|
74
|
+
# TextualMemoryItem
|
|
75
|
+
tags = chunk_res["tags"] if isinstance(chunk_res.get("tags"), list) else []
|
|
76
|
+
key = chunk_res.get("key", None)
|
|
77
|
+
|
|
78
|
+
node_i = TextualMemoryItem(
|
|
79
|
+
memory=value,
|
|
80
|
+
metadata=TreeNodeTextualMemoryMetadata(
|
|
81
|
+
user_id=info.get("user_id"),
|
|
82
|
+
session_id=info.get("session_id"),
|
|
83
|
+
memory_type="LongTermMemory",
|
|
84
|
+
status="activated",
|
|
85
|
+
tags=tags,
|
|
86
|
+
key=key,
|
|
87
|
+
embedding=embedding,
|
|
88
|
+
usage=[],
|
|
89
|
+
sources=[f"{scene_file}_{idx}"],
|
|
90
|
+
background="",
|
|
91
|
+
confidence=0.99,
|
|
92
|
+
type="fact",
|
|
93
|
+
),
|
|
94
|
+
)
|
|
95
|
+
return node_i
|
|
25
96
|
|
|
26
97
|
|
|
27
98
|
class SimpleStructMemReader(BaseMemReader, ABC):
|
|
@@ -40,11 +111,13 @@ class SimpleStructMemReader(BaseMemReader, ABC):
|
|
|
40
111
|
self.chunker = ChunkerFactory.from_config(config.chunker)
|
|
41
112
|
|
|
42
113
|
def _process_chat_data(self, scene_data_info, info):
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
|
|
114
|
+
lang = detect_lang("\n".join(scene_data_info))
|
|
115
|
+
template = PROMPT_DICT["chat"][lang]
|
|
116
|
+
examples = PROMPT_DICT["chat"][f"{lang}_example"]
|
|
117
|
+
|
|
118
|
+
prompt = template.replace("${conversation}", "\n".join(scene_data_info))
|
|
46
119
|
if self.config.remove_prompt_example:
|
|
47
|
-
prompt = prompt.replace(
|
|
120
|
+
prompt = prompt.replace(examples, "")
|
|
48
121
|
|
|
49
122
|
messages = [{"role": "user", "content": prompt}]
|
|
50
123
|
|
|
@@ -180,7 +253,7 @@ class SimpleStructMemReader(BaseMemReader, ABC):
|
|
|
180
253
|
elif type == "doc":
|
|
181
254
|
for item in scene_data:
|
|
182
255
|
try:
|
|
183
|
-
if
|
|
256
|
+
if os.path.exists(item):
|
|
184
257
|
parsed_text = parser.parse(item)
|
|
185
258
|
results.append({"file": "pure_text", "text": parsed_text})
|
|
186
259
|
else:
|
|
@@ -193,46 +266,42 @@ class SimpleStructMemReader(BaseMemReader, ABC):
|
|
|
193
266
|
|
|
194
267
|
def _process_doc_data(self, scene_data_info, info):
|
|
195
268
|
chunks = self.chunker.chunk(scene_data_info["text"])
|
|
196
|
-
messages = [
|
|
197
|
-
|
|
198
|
-
|
|
199
|
-
|
|
200
|
-
|
|
201
|
-
|
|
202
|
-
|
|
203
|
-
for chunk in chunks
|
|
204
|
-
]
|
|
205
|
-
|
|
206
|
-
processed_chunks = []
|
|
207
|
-
with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor:
|
|
208
|
-
futures = [executor.submit(self.llm.generate, message) for message in messages]
|
|
209
|
-
for future in concurrent.futures.as_completed(futures):
|
|
210
|
-
chunk_result = future.result()
|
|
211
|
-
if chunk_result:
|
|
212
|
-
processed_chunks.append(chunk_result)
|
|
269
|
+
messages = []
|
|
270
|
+
for chunk in chunks:
|
|
271
|
+
lang = detect_lang(chunk.text)
|
|
272
|
+
template = PROMPT_DICT["doc"][lang]
|
|
273
|
+
prompt = template.replace("{chunk_text}", chunk.text)
|
|
274
|
+
message = [{"role": "user", "content": prompt}]
|
|
275
|
+
messages.append(message)
|
|
213
276
|
|
|
214
|
-
processed_chunks = [self.parse_json_result(r) for r in processed_chunks]
|
|
215
277
|
doc_nodes = []
|
|
216
|
-
|
|
217
|
-
|
|
218
|
-
|
|
219
|
-
|
|
220
|
-
|
|
221
|
-
|
|
222
|
-
|
|
223
|
-
|
|
224
|
-
|
|
225
|
-
|
|
226
|
-
|
|
227
|
-
|
|
228
|
-
|
|
229
|
-
|
|
230
|
-
|
|
231
|
-
|
|
232
|
-
|
|
233
|
-
|
|
234
|
-
|
|
235
|
-
|
|
278
|
+
scene_file = scene_data_info["file"]
|
|
279
|
+
|
|
280
|
+
with concurrent.futures.ThreadPoolExecutor(max_workers=50) as executor:
|
|
281
|
+
futures = {
|
|
282
|
+
executor.submit(
|
|
283
|
+
_build_node,
|
|
284
|
+
idx,
|
|
285
|
+
msg,
|
|
286
|
+
info,
|
|
287
|
+
scene_file,
|
|
288
|
+
self.llm,
|
|
289
|
+
self.parse_json_result,
|
|
290
|
+
self.embedder,
|
|
291
|
+
): idx
|
|
292
|
+
for idx, msg in enumerate(messages)
|
|
293
|
+
}
|
|
294
|
+
total = len(futures)
|
|
295
|
+
|
|
296
|
+
for future in tqdm(
|
|
297
|
+
concurrent.futures.as_completed(futures), total=total, desc="Processing"
|
|
298
|
+
):
|
|
299
|
+
try:
|
|
300
|
+
node = future.result()
|
|
301
|
+
if node:
|
|
302
|
+
doc_nodes.append(node)
|
|
303
|
+
except Exception as e:
|
|
304
|
+
tqdm.write(f"[ERROR] {e}")
|
|
236
305
|
return doc_nodes
|
|
237
306
|
|
|
238
307
|
def parse_json_result(self, response_text):
|