MemoryOS 0.2.2__py3-none-any.whl → 1.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of MemoryOS might be problematic. Click here for more details.
- {memoryos-0.2.2.dist-info → memoryos-1.0.0.dist-info}/METADATA +6 -1
- {memoryos-0.2.2.dist-info → memoryos-1.0.0.dist-info}/RECORD +61 -55
- memos/__init__.py +1 -1
- memos/api/config.py +6 -8
- memos/api/context/context.py +1 -1
- memos/api/context/dependencies.py +11 -0
- memos/configs/internet_retriever.py +13 -0
- memos/configs/mem_scheduler.py +38 -16
- memos/graph_dbs/base.py +30 -3
- memos/graph_dbs/nebular.py +442 -194
- memos/graph_dbs/neo4j.py +14 -5
- memos/log.py +5 -0
- memos/mem_os/core.py +19 -9
- memos/mem_os/main.py +1 -1
- memos/mem_os/product.py +6 -69
- memos/mem_os/utils/default_config.py +1 -1
- memos/mem_os/utils/format_utils.py +11 -47
- memos/mem_os/utils/reference_utils.py +133 -0
- memos/mem_scheduler/base_scheduler.py +58 -55
- memos/mem_scheduler/{modules → general_modules}/base.py +1 -2
- memos/mem_scheduler/{modules → general_modules}/dispatcher.py +54 -15
- memos/mem_scheduler/{modules → general_modules}/rabbitmq_service.py +4 -4
- memos/mem_scheduler/{modules → general_modules}/redis_service.py +1 -1
- memos/mem_scheduler/{modules → general_modules}/retriever.py +19 -5
- memos/mem_scheduler/{modules → general_modules}/scheduler_logger.py +10 -4
- memos/mem_scheduler/general_scheduler.py +110 -67
- memos/mem_scheduler/monitors/__init__.py +0 -0
- memos/mem_scheduler/monitors/dispatcher_monitor.py +305 -0
- memos/mem_scheduler/{modules/monitor.py → monitors/general_monitor.py} +57 -19
- memos/mem_scheduler/mos_for_test_scheduler.py +7 -1
- memos/mem_scheduler/schemas/general_schemas.py +3 -2
- memos/mem_scheduler/schemas/message_schemas.py +2 -1
- memos/mem_scheduler/schemas/monitor_schemas.py +10 -2
- memos/mem_scheduler/utils/misc_utils.py +43 -2
- memos/memories/activation/item.py +1 -1
- memos/memories/activation/kv.py +20 -8
- memos/memories/textual/base.py +1 -1
- memos/memories/textual/general.py +1 -1
- memos/memories/textual/tree_text_memory/organize/{conflict.py → handler.py} +30 -48
- memos/memories/textual/tree_text_memory/organize/manager.py +8 -96
- memos/memories/textual/tree_text_memory/organize/relation_reason_detector.py +2 -0
- memos/memories/textual/tree_text_memory/organize/reorganizer.py +102 -140
- memos/memories/textual/tree_text_memory/retrieve/bochasearch.py +229 -0
- memos/memories/textual/tree_text_memory/retrieve/internet_retriever_factory.py +9 -0
- memos/memories/textual/tree_text_memory/retrieve/recall.py +15 -8
- memos/memories/textual/tree_text_memory/retrieve/reranker.py +1 -1
- memos/memories/textual/tree_text_memory/retrieve/searcher.py +177 -125
- memos/memories/textual/tree_text_memory/retrieve/task_goal_parser.py +7 -2
- memos/memories/textual/tree_text_memory/retrieve/utils.py +1 -1
- memos/memos_tools/lockfree_dict.py +120 -0
- memos/memos_tools/thread_safe_dict.py +288 -0
- memos/templates/mem_reader_prompts.py +2 -0
- memos/templates/mem_scheduler_prompts.py +23 -10
- memos/templates/mos_prompts.py +40 -11
- memos/templates/tree_reorganize_prompts.py +24 -17
- memos/utils.py +19 -0
- memos/memories/textual/tree_text_memory/organize/redundancy.py +0 -193
- {memoryos-0.2.2.dist-info → memoryos-1.0.0.dist-info}/LICENSE +0 -0
- {memoryos-0.2.2.dist-info → memoryos-1.0.0.dist-info}/WHEEL +0 -0
- {memoryos-0.2.2.dist-info → memoryos-1.0.0.dist-info}/entry_points.txt +0 -0
- /memos/mem_scheduler/{modules → general_modules}/__init__.py +0 -0
- /memos/mem_scheduler/{modules → general_modules}/misc.py +0 -0
memos/graph_dbs/neo4j.py
CHANGED
|
@@ -323,14 +323,16 @@ class Neo4jGraphDB(BaseGraphDB):
|
|
|
323
323
|
return result.single() is not None
|
|
324
324
|
|
|
325
325
|
# Graph Query & Reasoning
|
|
326
|
-
def get_node(self, id: str) -> dict[str, Any] | None:
|
|
326
|
+
def get_node(self, id: str, include_embedding: bool = True) -> dict[str, Any] | None:
|
|
327
327
|
"""
|
|
328
328
|
Retrieve the metadata and memory of a node.
|
|
329
329
|
Args:
|
|
330
330
|
id: Node identifier.
|
|
331
|
+
include_embedding (bool): Whether to include the large embedding field.
|
|
331
332
|
Returns:
|
|
332
333
|
Dictionary of node fields, or None if not found.
|
|
333
334
|
"""
|
|
335
|
+
|
|
334
336
|
where_user = ""
|
|
335
337
|
params = {"id": id}
|
|
336
338
|
if not self.config.use_multi_db and self.config.user_name:
|
|
@@ -343,11 +345,12 @@ class Neo4jGraphDB(BaseGraphDB):
|
|
|
343
345
|
record = session.run(query, params).single()
|
|
344
346
|
return self._parse_node(dict(record["n"])) if record else None
|
|
345
347
|
|
|
346
|
-
def get_nodes(self, ids: list[str]) -> list[dict[str, Any]]:
|
|
348
|
+
def get_nodes(self, ids: list[str], include_embedding: bool = True) -> list[dict[str, Any]]:
|
|
347
349
|
"""
|
|
348
350
|
Retrieve the metadata and memory of a list of nodes.
|
|
349
351
|
Args:
|
|
350
352
|
ids: List of Node identifier.
|
|
353
|
+
include_embedding (bool): Whether to include the large embedding field.
|
|
351
354
|
Returns:
|
|
352
355
|
list[dict]: Parsed node records containing 'id', 'memory', and 'metadata'.
|
|
353
356
|
|
|
@@ -355,6 +358,7 @@ class Neo4jGraphDB(BaseGraphDB):
|
|
|
355
358
|
- Assumes all provided IDs are valid and exist.
|
|
356
359
|
- Returns empty list if input is empty.
|
|
357
360
|
"""
|
|
361
|
+
|
|
358
362
|
if not ids:
|
|
359
363
|
return []
|
|
360
364
|
|
|
@@ -829,7 +833,7 @@ class Neo4jGraphDB(BaseGraphDB):
|
|
|
829
833
|
logger.error(f"[ERROR] Failed to clear database '{self.db_name}': {e}")
|
|
830
834
|
raise
|
|
831
835
|
|
|
832
|
-
def export_graph(self) -> dict[str, Any]:
|
|
836
|
+
def export_graph(self, include_embedding: bool = True) -> dict[str, Any]:
|
|
833
837
|
"""
|
|
834
838
|
Export all graph nodes and edges in a structured form.
|
|
835
839
|
|
|
@@ -910,12 +914,14 @@ class Neo4jGraphDB(BaseGraphDB):
|
|
|
910
914
|
target_id=edge["target"],
|
|
911
915
|
)
|
|
912
916
|
|
|
913
|
-
def get_all_memory_items(self, scope: str) -> list[dict]:
|
|
917
|
+
def get_all_memory_items(self, scope: str, include_embedding: bool = True) -> list[dict]:
|
|
914
918
|
"""
|
|
915
919
|
Retrieve all memory items of a specific memory_type.
|
|
916
920
|
|
|
917
921
|
Args:
|
|
918
922
|
scope (str): Must be one of 'WorkingMemory', 'LongTermMemory', or 'UserMemory'.
|
|
923
|
+
include_embedding (bool): Whether to include the large embedding field.
|
|
924
|
+
Returns:
|
|
919
925
|
|
|
920
926
|
Returns:
|
|
921
927
|
list[dict]: Full list of memory items under this scope.
|
|
@@ -940,12 +946,15 @@ class Neo4jGraphDB(BaseGraphDB):
|
|
|
940
946
|
results = session.run(query, params)
|
|
941
947
|
return [self._parse_node(dict(record["n"])) for record in results]
|
|
942
948
|
|
|
943
|
-
def get_structure_optimization_candidates(
|
|
949
|
+
def get_structure_optimization_candidates(
|
|
950
|
+
self, scope: str, include_embedding: bool = True
|
|
951
|
+
) -> list[dict]:
|
|
944
952
|
"""
|
|
945
953
|
Find nodes that are likely candidates for structure optimization:
|
|
946
954
|
- Isolated nodes, nodes with empty background, or nodes with exactly one child.
|
|
947
955
|
- Plus: the child of any parent node that has exactly one child.
|
|
948
956
|
"""
|
|
957
|
+
|
|
949
958
|
where_clause = """
|
|
950
959
|
WHERE n.memory_type = $scope
|
|
951
960
|
AND n.status = 'activated'
|
memos/log.py
CHANGED
|
@@ -4,9 +4,14 @@ from logging.config import dictConfig
|
|
|
4
4
|
from pathlib import Path
|
|
5
5
|
from sys import stdout
|
|
6
6
|
|
|
7
|
+
from dotenv import load_dotenv
|
|
8
|
+
|
|
7
9
|
from memos import settings
|
|
8
10
|
|
|
9
11
|
|
|
12
|
+
# Load environment variables
|
|
13
|
+
load_dotenv()
|
|
14
|
+
|
|
10
15
|
selected_log_level = logging.DEBUG if settings.DEBUG else logging.WARNING
|
|
11
16
|
|
|
12
17
|
|
memos/mem_os/core.py
CHANGED
|
@@ -24,6 +24,7 @@ from memos.mem_user.user_manager import UserManager, UserRole
|
|
|
24
24
|
from memos.memories.activation.item import ActivationMemoryItem
|
|
25
25
|
from memos.memories.parametric.item import ParametricMemoryItem
|
|
26
26
|
from memos.memories.textual.item import TextualMemoryItem, TextualMemoryMetadata
|
|
27
|
+
from memos.memos_tools.thread_safe_dict import ThreadSafeDict
|
|
27
28
|
from memos.templates.mos_prompts import QUERY_REWRITING_PROMPT
|
|
28
29
|
from memos.types import ChatHistory, MessageList, MOSSearchResult
|
|
29
30
|
|
|
@@ -42,10 +43,13 @@ class MOSCore:
|
|
|
42
43
|
self.config = config
|
|
43
44
|
self.user_id = config.user_id
|
|
44
45
|
self.session_id = config.session_id
|
|
45
|
-
self.mem_cubes: dict[str, GeneralMemCube] = {}
|
|
46
46
|
self.chat_llm = LLMFactory.from_config(config.chat_model)
|
|
47
47
|
self.mem_reader = MemReaderFactory.from_config(config.mem_reader)
|
|
48
48
|
self.chat_history_manager: dict[str, ChatHistory] = {}
|
|
49
|
+
# use thread safe dict for multi-user product-server scenario
|
|
50
|
+
self.mem_cubes: ThreadSafeDict[str, GeneralMemCube] = (
|
|
51
|
+
ThreadSafeDict() if user_manager is not None else {}
|
|
52
|
+
)
|
|
49
53
|
self._register_chat_history()
|
|
50
54
|
|
|
51
55
|
# Use provided user_manager or create a new one
|
|
@@ -124,7 +128,7 @@ class MOSCore:
|
|
|
124
128
|
chat_llm=self.chat_llm, process_llm=self.chat_llm
|
|
125
129
|
)
|
|
126
130
|
else:
|
|
127
|
-
# Configure scheduler
|
|
131
|
+
# Configure scheduler general_modules
|
|
128
132
|
self._mem_scheduler.initialize_modules(
|
|
129
133
|
chat_llm=self.chat_llm, process_llm=self.mem_reader.llm
|
|
130
134
|
)
|
|
@@ -185,7 +189,7 @@ class MOSCore:
|
|
|
185
189
|
self.chat_history_manager[user_id] = ChatHistory(
|
|
186
190
|
user_id=user_id,
|
|
187
191
|
session_id=self.session_id,
|
|
188
|
-
created_at=datetime.
|
|
192
|
+
created_at=datetime.utcnow(),
|
|
189
193
|
total_messages=0,
|
|
190
194
|
chat_history=[],
|
|
191
195
|
)
|
|
@@ -279,7 +283,7 @@ class MOSCore:
|
|
|
279
283
|
mem_cube=mem_cube,
|
|
280
284
|
label=QUERY_LABEL,
|
|
281
285
|
content=query,
|
|
282
|
-
timestamp=datetime.
|
|
286
|
+
timestamp=datetime.utcnow(),
|
|
283
287
|
)
|
|
284
288
|
self.mem_scheduler.submit_messages(messages=[message_item])
|
|
285
289
|
|
|
@@ -338,7 +342,7 @@ class MOSCore:
|
|
|
338
342
|
mem_cube=mem_cube,
|
|
339
343
|
label=ANSWER_LABEL,
|
|
340
344
|
content=response,
|
|
341
|
-
timestamp=datetime.
|
|
345
|
+
timestamp=datetime.utcnow(),
|
|
342
346
|
)
|
|
343
347
|
self.mem_scheduler.submit_messages(messages=[message_item])
|
|
344
348
|
|
|
@@ -575,7 +579,13 @@ class MOSCore:
|
|
|
575
579
|
}
|
|
576
580
|
if install_cube_ids is None:
|
|
577
581
|
install_cube_ids = user_cube_ids
|
|
578
|
-
|
|
582
|
+
# create exist dict in mem_cubes and avoid one search slow
|
|
583
|
+
tmp_mem_cubes = {}
|
|
584
|
+
for mem_cube_id in install_cube_ids:
|
|
585
|
+
if mem_cube_id in self.mem_cubes:
|
|
586
|
+
tmp_mem_cubes[mem_cube_id] = self.mem_cubes.get(mem_cube_id)
|
|
587
|
+
|
|
588
|
+
for mem_cube_id, mem_cube in tmp_mem_cubes.items():
|
|
579
589
|
if (
|
|
580
590
|
(mem_cube_id in install_cube_ids)
|
|
581
591
|
and (mem_cube.text_mem is not None)
|
|
@@ -681,7 +691,7 @@ class MOSCore:
|
|
|
681
691
|
mem_cube=mem_cube,
|
|
682
692
|
label=ADD_LABEL,
|
|
683
693
|
content=json.dumps(mem_ids),
|
|
684
|
-
timestamp=datetime.
|
|
694
|
+
timestamp=datetime.utcnow(),
|
|
685
695
|
)
|
|
686
696
|
self.mem_scheduler.submit_messages(messages=[message_item])
|
|
687
697
|
|
|
@@ -725,7 +735,7 @@ class MOSCore:
|
|
|
725
735
|
mem_cube=mem_cube,
|
|
726
736
|
label=ADD_LABEL,
|
|
727
737
|
content=json.dumps(mem_ids),
|
|
728
|
-
timestamp=datetime.
|
|
738
|
+
timestamp=datetime.utcnow(),
|
|
729
739
|
)
|
|
730
740
|
self.mem_scheduler.submit_messages(messages=[message_item])
|
|
731
741
|
|
|
@@ -756,7 +766,7 @@ class MOSCore:
|
|
|
756
766
|
mem_cube=mem_cube,
|
|
757
767
|
label=ADD_LABEL,
|
|
758
768
|
content=json.dumps(mem_ids),
|
|
759
|
-
timestamp=datetime.
|
|
769
|
+
timestamp=datetime.utcnow(),
|
|
760
770
|
)
|
|
761
771
|
self.mem_scheduler.submit_messages(messages=[message_item])
|
|
762
772
|
|
memos/mem_os/main.py
CHANGED
memos/mem_os/product.py
CHANGED
|
@@ -22,7 +22,9 @@ from memos.mem_os.utils.format_utils import (
|
|
|
22
22
|
filter_nodes_by_tree_ids,
|
|
23
23
|
remove_embedding_recursive,
|
|
24
24
|
sort_children_by_memory_type,
|
|
25
|
-
|
|
25
|
+
)
|
|
26
|
+
from memos.mem_os.utils.reference_utils import (
|
|
27
|
+
process_streaming_references_complete,
|
|
26
28
|
)
|
|
27
29
|
from memos.mem_scheduler.schemas.general_schemas import (
|
|
28
30
|
ANSWER_LABEL,
|
|
@@ -406,71 +408,6 @@ class MOSProduct(MOSCore):
|
|
|
406
408
|
return MEMOS_PRODUCT_ENHANCE_PROMPT + personal_memory_context + outer_memory_context
|
|
407
409
|
return MEMOS_PRODUCT_ENHANCE_PROMPT
|
|
408
410
|
|
|
409
|
-
def _process_streaming_references_complete(self, text_buffer: str) -> tuple[str, str]:
|
|
410
|
-
"""
|
|
411
|
-
Complete streaming reference processing to ensure reference tags are never split.
|
|
412
|
-
|
|
413
|
-
Args:
|
|
414
|
-
text_buffer (str): The accumulated text buffer.
|
|
415
|
-
|
|
416
|
-
Returns:
|
|
417
|
-
tuple[str, str]: (processed_text, remaining_buffer)
|
|
418
|
-
"""
|
|
419
|
-
import re
|
|
420
|
-
|
|
421
|
-
# Pattern to match complete reference tags: [refid:memoriesID]
|
|
422
|
-
complete_pattern = r"\[\d+:[^\]]+\]"
|
|
423
|
-
|
|
424
|
-
# Find all complete reference tags
|
|
425
|
-
complete_matches = list(re.finditer(complete_pattern, text_buffer))
|
|
426
|
-
|
|
427
|
-
if complete_matches:
|
|
428
|
-
# Find the last complete tag
|
|
429
|
-
last_match = complete_matches[-1]
|
|
430
|
-
end_pos = last_match.end()
|
|
431
|
-
|
|
432
|
-
# Get text up to the end of the last complete tag
|
|
433
|
-
processed_text = text_buffer[:end_pos]
|
|
434
|
-
remaining_buffer = text_buffer[end_pos:]
|
|
435
|
-
|
|
436
|
-
# Apply reference splitting to the processed text
|
|
437
|
-
processed_text = split_continuous_references(processed_text)
|
|
438
|
-
|
|
439
|
-
return processed_text, remaining_buffer
|
|
440
|
-
|
|
441
|
-
# Check for incomplete reference tags
|
|
442
|
-
# Look for opening bracket with number and colon
|
|
443
|
-
opening_pattern = r"\[\d+:"
|
|
444
|
-
opening_matches = list(re.finditer(opening_pattern, text_buffer))
|
|
445
|
-
|
|
446
|
-
if opening_matches:
|
|
447
|
-
# Find the last opening tag
|
|
448
|
-
last_opening = opening_matches[-1]
|
|
449
|
-
opening_start = last_opening.start()
|
|
450
|
-
|
|
451
|
-
# Check if we have a complete opening pattern
|
|
452
|
-
if last_opening.end() <= len(text_buffer):
|
|
453
|
-
# We have a complete opening pattern, keep everything in buffer
|
|
454
|
-
return "", text_buffer
|
|
455
|
-
else:
|
|
456
|
-
# Incomplete opening pattern, return text before it
|
|
457
|
-
processed_text = text_buffer[:opening_start]
|
|
458
|
-
# Apply reference splitting to the processed text
|
|
459
|
-
processed_text = split_continuous_references(processed_text)
|
|
460
|
-
return processed_text, text_buffer[opening_start:]
|
|
461
|
-
|
|
462
|
-
# Check for partial opening pattern (starts with [ but not complete)
|
|
463
|
-
if "[" in text_buffer:
|
|
464
|
-
ref_start = text_buffer.find("[")
|
|
465
|
-
processed_text = text_buffer[:ref_start]
|
|
466
|
-
# Apply reference splitting to the processed text
|
|
467
|
-
processed_text = split_continuous_references(processed_text)
|
|
468
|
-
return processed_text, text_buffer[ref_start:]
|
|
469
|
-
|
|
470
|
-
# No reference tags found, apply reference splitting and return all text
|
|
471
|
-
processed_text = split_continuous_references(text_buffer)
|
|
472
|
-
return processed_text, ""
|
|
473
|
-
|
|
474
411
|
def _extract_references_from_response(self, response: str) -> tuple[str, list[dict]]:
|
|
475
412
|
"""
|
|
476
413
|
Extract reference information from the response and return clean text.
|
|
@@ -554,7 +491,7 @@ class MOSProduct(MOSCore):
|
|
|
554
491
|
mem_cube=self.mem_cubes[mem_cube_id],
|
|
555
492
|
label=label,
|
|
556
493
|
content=query,
|
|
557
|
-
timestamp=datetime.
|
|
494
|
+
timestamp=datetime.utcnow(),
|
|
558
495
|
)
|
|
559
496
|
self.mem_scheduler.submit_messages(messages=[message_item])
|
|
560
497
|
|
|
@@ -868,7 +805,7 @@ class MOSProduct(MOSCore):
|
|
|
868
805
|
full_response += chunk
|
|
869
806
|
|
|
870
807
|
# Process buffer to ensure complete reference tags
|
|
871
|
-
processed_chunk, remaining_buffer = self._process_streaming_references_complete(buffer)
|
|
808
|
+
processed_chunk, remaining_buffer = process_streaming_references_complete(buffer)
|
|
872
809
|
|
|
873
810
|
if processed_chunk:
|
|
874
811
|
chunk_data = f"data: {json.dumps({'type': 'text', 'data': processed_chunk}, ensure_ascii=False)}\n\n"
|
|
@@ -877,7 +814,7 @@ class MOSProduct(MOSCore):
|
|
|
877
814
|
|
|
878
815
|
# Process any remaining buffer
|
|
879
816
|
if buffer:
|
|
880
|
-
processed_chunk, remaining_buffer = self._process_streaming_references_complete(buffer)
|
|
817
|
+
processed_chunk, remaining_buffer = process_streaming_references_complete(buffer)
|
|
881
818
|
if processed_chunk:
|
|
882
819
|
chunk_data = f"data: {json.dumps({'type': 'text', 'data': processed_chunk}, ensure_ascii=False)}\n\n"
|
|
883
820
|
yield chunk_data
|
memos/mem_os/utils/default_config.py
CHANGED
|
@@ -112,7 +112,7 @@ def get_default_config(
|
|
|
112
112
|
"thread_pool_max_workers": kwargs.get("scheduler_thread_pool_max_workers", 10),
|
|
113
113
|
"consume_interval_seconds": kwargs.get("scheduler_consume_interval_seconds", 3),
|
|
114
114
|
"enable_parallel_dispatch": kwargs.get("scheduler_enable_parallel_dispatch", True),
|
|
115
|
-
"
|
|
115
|
+
"enable_activation_memory": True,
|
|
116
116
|
},
|
|
117
117
|
}
|
|
118
118
|
|
memos/mem_os/utils/format_utils.py
CHANGED
|
@@ -570,15 +570,23 @@ def convert_graph_to_tree_forworkmem(
|
|
|
570
570
|
else:
|
|
571
571
|
other_roots.append(root_id)
|
|
572
572
|
|
|
573
|
-
def build_tree(node_id: str) -> dict[str, Any]:
|
|
574
|
-
"""Recursively build tree structure"""
|
|
573
|
+
def build_tree(node_id: str, visited=None) -> dict[str, Any] | None:
|
|
574
|
+
"""Recursively build tree structure with cycle detection"""
|
|
575
|
+
if visited is None:
|
|
576
|
+
visited = set()
|
|
577
|
+
|
|
578
|
+
if node_id in visited:
|
|
579
|
+
logger.warning(f"[build_tree] Detected cycle at node {node_id}, skipping.")
|
|
580
|
+
return None
|
|
581
|
+
visited.add(node_id)
|
|
582
|
+
|
|
575
583
|
if node_id not in node_map:
|
|
576
584
|
return None
|
|
577
585
|
|
|
578
586
|
children_ids = children_map.get(node_id, [])
|
|
579
587
|
children = []
|
|
580
588
|
for child_id in children_ids:
|
|
581
|
-
child_tree = build_tree(child_id)
|
|
589
|
+
child_tree = build_tree(child_id, visited)
|
|
582
590
|
if child_tree:
|
|
583
591
|
children.append(child_tree)
|
|
584
592
|
|
|
@@ -1355,47 +1363,3 @@ def clean_json_response(response: str) -> str:
|
|
|
1355
1363
|
str: Clean JSON string without markdown formatting
|
|
1356
1364
|
"""
|
|
1357
1365
|
return response.replace("```json", "").replace("```", "").strip()
|
|
1358
|
-
|
|
1359
|
-
|
|
1360
|
-
def split_continuous_references(text: str) -> str:
|
|
1361
|
-
"""
|
|
1362
|
-
Split continuous reference tags into individual reference tags.
|
|
1363
|
-
|
|
1364
|
-
Converts patterns like [1:92ff35fb, 4:bfe6f044] to [1:92ff35fb] [4:bfe6f044]
|
|
1365
|
-
|
|
1366
|
-
Only processes text if:
|
|
1367
|
-
1. '[' appears exactly once
|
|
1368
|
-
2. ']' appears exactly once
|
|
1369
|
-
3. Contains commas between '[' and ']'
|
|
1370
|
-
|
|
1371
|
-
Args:
|
|
1372
|
-
text (str): Text containing reference tags
|
|
1373
|
-
|
|
1374
|
-
Returns:
|
|
1375
|
-
str: Text with split reference tags, or original text if conditions not met
|
|
1376
|
-
"""
|
|
1377
|
-
# Early return if text is empty
|
|
1378
|
-
if not text:
|
|
1379
|
-
return text
|
|
1380
|
-
# Check if '[' appears exactly once
|
|
1381
|
-
if text.count("[") != 1:
|
|
1382
|
-
return text
|
|
1383
|
-
# Check if ']' appears exactly once
|
|
1384
|
-
if text.count("]") != 1:
|
|
1385
|
-
return text
|
|
1386
|
-
# Find positions of brackets
|
|
1387
|
-
open_bracket_pos = text.find("[")
|
|
1388
|
-
close_bracket_pos = text.find("]")
|
|
1389
|
-
|
|
1390
|
-
# Check if brackets are in correct order
|
|
1391
|
-
if open_bracket_pos >= close_bracket_pos:
|
|
1392
|
-
return text
|
|
1393
|
-
# Extract content between brackets
|
|
1394
|
-
content_between_brackets = text[open_bracket_pos + 1 : close_bracket_pos]
|
|
1395
|
-
# Check if there's a comma between brackets
|
|
1396
|
-
if "," not in content_between_brackets:
|
|
1397
|
-
return text
|
|
1398
|
-
text = text.replace(content_between_brackets, content_between_brackets.replace(", ", "]["))
|
|
1399
|
-
text = text.replace(content_between_brackets, content_between_brackets.replace(",", "]["))
|
|
1400
|
-
|
|
1401
|
-
return text
|
memos/mem_os/utils/reference_utils.py
ADDED
|
@@ -0,0 +1,133 @@
|
|
|
1
|
+
def split_continuous_references(text: str) -> str:
|
|
2
|
+
"""
|
|
3
|
+
Split continuous reference tags into individual reference tags.
|
|
4
|
+
|
|
5
|
+
Converts patterns like [1:92ff35fb, 4:bfe6f044] to [1:92ff35fb] [4:bfe6f044]
|
|
6
|
+
|
|
7
|
+
Only processes text if:
|
|
8
|
+
1. '[' appears exactly once
|
|
9
|
+
2. ']' appears exactly once
|
|
10
|
+
3. Contains commas between '[' and ']'
|
|
11
|
+
|
|
12
|
+
Args:
|
|
13
|
+
text (str): Text containing reference tags
|
|
14
|
+
|
|
15
|
+
Returns:
|
|
16
|
+
str: Text with split reference tags, or original text if conditions not met
|
|
17
|
+
"""
|
|
18
|
+
# Early return if text is empty
|
|
19
|
+
if not text:
|
|
20
|
+
return text
|
|
21
|
+
# Check if '[' appears exactly once
|
|
22
|
+
if text.count("[") != 1:
|
|
23
|
+
return text
|
|
24
|
+
# Check if ']' appears exactly once
|
|
25
|
+
if text.count("]") != 1:
|
|
26
|
+
return text
|
|
27
|
+
# Find positions of brackets
|
|
28
|
+
open_bracket_pos = text.find("[")
|
|
29
|
+
close_bracket_pos = text.find("]")
|
|
30
|
+
|
|
31
|
+
# Check if brackets are in correct order
|
|
32
|
+
if open_bracket_pos >= close_bracket_pos:
|
|
33
|
+
return text
|
|
34
|
+
# Extract content between brackets
|
|
35
|
+
content_between_brackets = text[open_bracket_pos + 1 : close_bracket_pos]
|
|
36
|
+
# Check if there's a comma between brackets
|
|
37
|
+
if "," not in content_between_brackets:
|
|
38
|
+
return text
|
|
39
|
+
text = text.replace(content_between_brackets, content_between_brackets.replace(", ", "]["))
|
|
40
|
+
text = text.replace(content_between_brackets, content_between_brackets.replace(",", "]["))
|
|
41
|
+
|
|
42
|
+
return text
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
def process_streaming_references_complete(text_buffer: str) -> tuple[str, str]:
|
|
46
|
+
"""
|
|
47
|
+
Complete streaming reference processing to ensure reference tags are never split.
|
|
48
|
+
|
|
49
|
+
Args:
|
|
50
|
+
text_buffer (str): The accumulated text buffer.
|
|
51
|
+
|
|
52
|
+
Returns:
|
|
53
|
+
tuple[str, str]: (processed_text, remaining_buffer)
|
|
54
|
+
"""
|
|
55
|
+
import re
|
|
56
|
+
|
|
57
|
+
# Pattern to match complete reference tags: [refid:memoriesID]
|
|
58
|
+
complete_pattern = r"\[\d+:[^\]]+\]"
|
|
59
|
+
|
|
60
|
+
# Find all complete reference tags
|
|
61
|
+
complete_matches = list(re.finditer(complete_pattern, text_buffer))
|
|
62
|
+
|
|
63
|
+
if complete_matches:
|
|
64
|
+
# Find the last complete tag
|
|
65
|
+
last_match = complete_matches[-1]
|
|
66
|
+
end_pos = last_match.end()
|
|
67
|
+
|
|
68
|
+
# Check if there's any incomplete reference after the last complete one
|
|
69
|
+
remaining_text = text_buffer[end_pos:]
|
|
70
|
+
|
|
71
|
+
# Look for potential incomplete reference patterns after the last complete tag
|
|
72
|
+
incomplete_pattern = r"\[\d*:?[^\]]*$"
|
|
73
|
+
if re.search(incomplete_pattern, remaining_text):
|
|
74
|
+
# There's a potential incomplete reference, find where it starts
|
|
75
|
+
incomplete_match = re.search(incomplete_pattern, remaining_text)
|
|
76
|
+
if incomplete_match:
|
|
77
|
+
incomplete_start = end_pos + incomplete_match.start()
|
|
78
|
+
processed_text = text_buffer[:incomplete_start]
|
|
79
|
+
remaining_buffer = text_buffer[incomplete_start:]
|
|
80
|
+
|
|
81
|
+
# Apply reference splitting to the processed text
|
|
82
|
+
processed_text = split_continuous_references(processed_text)
|
|
83
|
+
return processed_text, remaining_buffer
|
|
84
|
+
|
|
85
|
+
# No incomplete reference after the last complete tag, process all
|
|
86
|
+
processed_text = split_continuous_references(text_buffer)
|
|
87
|
+
return processed_text, ""
|
|
88
|
+
|
|
89
|
+
# Check for incomplete reference tags - be more specific about what constitutes a potential reference
|
|
90
|
+
# Look for opening bracket with number and colon that could be a reference tag
|
|
91
|
+
opening_pattern = r"\[\d+:"
|
|
92
|
+
opening_matches = list(re.finditer(opening_pattern, text_buffer))
|
|
93
|
+
|
|
94
|
+
if opening_matches:
|
|
95
|
+
# Find the last opening tag
|
|
96
|
+
last_opening = opening_matches[-1]
|
|
97
|
+
opening_start = last_opening.start()
|
|
98
|
+
|
|
99
|
+
# Check if this might be a complete reference tag (has closing bracket after the pattern)
|
|
100
|
+
remaining_text = text_buffer[last_opening.end() :]
|
|
101
|
+
if "]" in remaining_text:
|
|
102
|
+
# This looks like a complete reference tag, process it
|
|
103
|
+
processed_text = split_continuous_references(text_buffer)
|
|
104
|
+
return processed_text, ""
|
|
105
|
+
else:
|
|
106
|
+
# Incomplete reference tag, keep it in buffer
|
|
107
|
+
processed_text = text_buffer[:opening_start]
|
|
108
|
+
processed_text = split_continuous_references(processed_text)
|
|
109
|
+
return processed_text, text_buffer[opening_start:]
|
|
110
|
+
|
|
111
|
+
# More sophisticated check for potential reference patterns
|
|
112
|
+
# Only hold back text if we see a pattern that could be the start of a reference tag
|
|
113
|
+
potential_ref_pattern = r"\[\d*:?$" # Matches [, [1, [12:, etc. at end of buffer
|
|
114
|
+
if re.search(potential_ref_pattern, text_buffer):
|
|
115
|
+
# Find the position of the potential reference start
|
|
116
|
+
match = re.search(potential_ref_pattern, text_buffer)
|
|
117
|
+
if match:
|
|
118
|
+
ref_start = match.start()
|
|
119
|
+
processed_text = text_buffer[:ref_start]
|
|
120
|
+
processed_text = split_continuous_references(processed_text)
|
|
121
|
+
return processed_text, text_buffer[ref_start:]
|
|
122
|
+
|
|
123
|
+
# Check for standalone [ only at the very end of the buffer
|
|
124
|
+
# This prevents cutting off mathematical expressions like [ \Delta U = Q - W ]
|
|
125
|
+
if text_buffer.endswith("["):
|
|
126
|
+
# Only hold back the single [ character
|
|
127
|
+
processed_text = text_buffer[:-1]
|
|
128
|
+
processed_text = split_continuous_references(processed_text)
|
|
129
|
+
return processed_text, "["
|
|
130
|
+
|
|
131
|
+
# No reference-like patterns found, process all text
|
|
132
|
+
processed_text = split_continuous_references(text_buffer)
|
|
133
|
+
return processed_text, ""
|