TeLLMgramBot 3.12.0__tar.gz → 3.13.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {tellmgrambot-3.12.0 → tellmgrambot-3.13.0}/PKG-INFO +5 -1
- {tellmgrambot-3.12.0 → tellmgrambot-3.13.0}/README.md +4 -0
- {tellmgrambot-3.12.0 → tellmgrambot-3.13.0}/TeLLMgramBot/TeLLMgramBot.py +33 -8
- tellmgrambot-3.13.0/TeLLMgramBot/archive.py +321 -0
- {tellmgrambot-3.12.0 → tellmgrambot-3.13.0}/TeLLMgramBot/database.py +349 -288
- {tellmgrambot-3.12.0 → tellmgrambot-3.13.0}/TeLLMgramBot/initialize.py +23 -8
- {tellmgrambot-3.12.0 → tellmgrambot-3.13.0}/TeLLMgramBot/utils.py +11 -0
- {tellmgrambot-3.12.0 → tellmgrambot-3.13.0}/TeLLMgramBot.egg-info/PKG-INFO +5 -1
- {tellmgrambot-3.12.0 → tellmgrambot-3.13.0}/TeLLMgramBot.egg-info/SOURCES.txt +1 -0
- {tellmgrambot-3.12.0 → tellmgrambot-3.13.0}/setup.py +1 -1
- {tellmgrambot-3.12.0 → tellmgrambot-3.13.0}/LICENSE +0 -0
- {tellmgrambot-3.12.0 → tellmgrambot-3.13.0}/TeLLMgramBot/__init__.py +0 -0
- {tellmgrambot-3.12.0 → tellmgrambot-3.13.0}/TeLLMgramBot/conversation.py +0 -0
- {tellmgrambot-3.12.0 → tellmgrambot-3.13.0}/TeLLMgramBot/message_handlers.py +0 -0
- {tellmgrambot-3.12.0 → tellmgrambot-3.13.0}/TeLLMgramBot/models.py +0 -0
- {tellmgrambot-3.12.0 → tellmgrambot-3.13.0}/TeLLMgramBot/providers/__init__.py +0 -0
- {tellmgrambot-3.12.0 → tellmgrambot-3.13.0}/TeLLMgramBot/providers/anthropic_provider.py +0 -0
- {tellmgrambot-3.12.0 → tellmgrambot-3.13.0}/TeLLMgramBot/providers/base.py +0 -0
- {tellmgrambot-3.12.0 → tellmgrambot-3.13.0}/TeLLMgramBot/providers/factory.py +0 -0
- {tellmgrambot-3.12.0 → tellmgrambot-3.13.0}/TeLLMgramBot/providers/openai_provider.py +0 -0
- {tellmgrambot-3.12.0 → tellmgrambot-3.13.0}/TeLLMgramBot/tools.py +0 -0
- {tellmgrambot-3.12.0 → tellmgrambot-3.13.0}/TeLLMgramBot/web_utils.py +0 -0
- {tellmgrambot-3.12.0 → tellmgrambot-3.13.0}/TeLLMgramBot.egg-info/dependency_links.txt +0 -0
- {tellmgrambot-3.12.0 → tellmgrambot-3.13.0}/TeLLMgramBot.egg-info/requires.txt +0 -0
- {tellmgrambot-3.12.0 → tellmgrambot-3.13.0}/TeLLMgramBot.egg-info/top_level.txt +0 -0
- {tellmgrambot-3.12.0 → tellmgrambot-3.13.0}/setup.cfg +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: TeLLMgramBot
|
|
3
|
-
Version: 3.12.0
|
|
3
|
+
Version: 3.13.0
|
|
4
4
|
Summary: LLM-powered Telegram bot (OpenAI + Anthropic)
|
|
5
5
|
Home-page: https://github.com/Digital-Heresy/TeLLMgramBot
|
|
6
6
|
Author: Digital Heresy
|
|
@@ -48,6 +48,9 @@ The basic goal of this project is to create a bridge between a Telegram Bot and
|
|
|
48
48
|
* Token limits measure conversation length and determine when to prune oldest messages to stay within model limits.
|
|
49
49
|
* The bot loads the user's full history across all chats up to 50% of the token budget. In private chats, shared group context fills the remaining budget, enabling the bot to reference group conversations from a private context.
|
|
50
50
|
* This eliminates amnesia when switching between private and group chats.
|
|
51
|
+
* Conversation archive preserves long-term context without consuming token budget.
|
|
52
|
+
* Older messages are automatically distilled into concise daily summaries (Tier 1), then progressively compressed into monthly digests (Tier 2). Raw messages are never deleted; archive rows surface seamlessly in search results and context loading.
|
|
53
|
+
* Configurable via `archive_days` (default 60 days before Tier 1 triggers; Tier 2 triggers at 2x this value).
|
|
51
54
|
* Users can manage privacy via two commands:
|
|
52
55
|
* `/forget` - In private chats, clears your full conversation and resets all active sessions. In group chats, removes only your messages and cleans up paired bot replies.
|
|
53
56
|
* `/private` - Toggle private mode (private chats only). When ON, your messages in private chats are excluded from group conversation contexts, enabling selective privacy even in shared groups.
|
|
@@ -152,6 +155,7 @@ When the bot is triggered in a group and about to respond (not deferring to anot
|
|
|
152
155
|
- `db_name`: Optional custom database filename without extension (e.g. `MyBot` creates `MyBot.db`); omit for default `conversations.db`. Use distinct names when running multiple bot instances in the same directory.
|
|
153
156
|
- `token_limit`: Max tokens (optional; defaults to model's maximum)
|
|
154
157
|
- `search_limit`: Max search results (optional; defaults to 30)
|
|
158
|
+
- `archive_days`: Days before messages are eligible for archival (optional; default 60, minimum 1). Older messages are distilled into daily summaries, then progressively compressed into monthly digests. Once archived, raw messages are no longer loaded into the LLM context; they remain reachable only through message search.
|
|
155
159
|
- `tools`: Optional list of webhook and MCP tool definitions (admin-only, private chat only). See [docs/tools.md](docs/tools.md) for schema and examples.
|
|
156
160
|
4. **Disable group privacy mode in BotFather:**
|
|
157
161
|
```
|
|
@@ -16,6 +16,9 @@ The basic goal of this project is to create a bridge between a Telegram Bot and
|
|
|
16
16
|
* Token limits measure conversation length and determine when to prune oldest messages to stay within model limits.
|
|
17
17
|
* The bot loads the user's full history across all chats up to 50% of the token budget. In private chats, shared group context fills the remaining budget, enabling the bot to reference group conversations from a private context.
|
|
18
18
|
* This eliminates amnesia when switching between private and group chats.
|
|
19
|
+
* Conversation archive preserves long-term context without consuming token budget.
|
|
20
|
+
* Older messages are automatically distilled into concise daily summaries (Tier 1), then progressively compressed into monthly digests (Tier 2). Raw messages are never deleted; archive rows surface seamlessly in search results and context loading.
|
|
21
|
+
* Configurable via `archive_days` (default 60 days before Tier 1 triggers; Tier 2 triggers at 2x this value).
|
|
19
22
|
* Users can manage privacy via two commands:
|
|
20
23
|
* `/forget` - In private chats, clears your full conversation and resets all active sessions. In group chats, removes only your messages and cleans up paired bot replies.
|
|
21
24
|
* `/private` - Toggle private mode (private chats only). When ON, your messages in private chats are excluded from group conversation contexts, enabling selective privacy even in shared groups.
|
|
@@ -120,6 +123,7 @@ When the bot is triggered in a group and about to respond (not deferring to anot
|
|
|
120
123
|
- `db_name`: Optional custom database filename without extension (e.g. `MyBot` creates `MyBot.db`); omit for default `conversations.db`. Use distinct names when running multiple bot instances in the same directory.
|
|
121
124
|
- `token_limit`: Max tokens (optional; defaults to model's maximum)
|
|
122
125
|
- `search_limit`: Max search results (optional; defaults to 30)
|
|
126
|
+
- `archive_days`: Days before messages are eligible for archival (optional; default 60, minimum 1). Older messages are distilled into daily summaries, then progressively compressed into monthly digests. Once archived, raw messages are no longer loaded into the LLM context; they remain reachable only through message search.
|
|
123
127
|
- `tools`: Optional list of webhook and MCP tool definitions (admin-only, private chat only). See [docs/tools.md](docs/tools.md) for schema and examples.
|
|
124
128
|
4. **Disable group privacy mode in BotFather:**
|
|
125
129
|
```
|
|
@@ -22,6 +22,8 @@ from .database import (
|
|
|
22
22
|
delete_messages_for_chat,
|
|
23
23
|
delete_private_messages_for_user,
|
|
24
24
|
delete_bot_replies_for_user,
|
|
25
|
+
delete_archive_for_user,
|
|
26
|
+
delete_archive_for_chat,
|
|
25
27
|
get_shared_group_chat_ids,
|
|
26
28
|
message_id_exists,
|
|
27
29
|
update_message_tg_id,
|
|
@@ -30,6 +32,7 @@ from .database import (
|
|
|
30
32
|
upsert_user,
|
|
31
33
|
wipe_all_data,
|
|
32
34
|
)
|
|
35
|
+
from .archive import run_archival
|
|
33
36
|
from .initialize import (
|
|
34
37
|
INIT_BOT_CONFIG,
|
|
35
38
|
ApiKeyStatus,
|
|
@@ -61,11 +64,12 @@ _SEARCH_TOOL = {
|
|
|
61
64
|
"name": "search_messages",
|
|
62
65
|
"description": (
|
|
63
66
|
"Search the full message history across the user's private chat and shared group chats. "
|
|
67
|
+
"Results include both raw messages and archived summaries of older content. "
|
|
64
68
|
"Use whenever the user asks who said something, what someone said, or what was discussed. "
|
|
65
69
|
"Always search before claiming a person has no message history -- do not assume from context alone. "
|
|
66
70
|
"Run the search immediately when it would help answer the question -- do not ask the user for permission to search. "
|
|
67
71
|
"All filters are optional -- omit them to retrieve recent messages broadly. "
|
|
68
|
-
"Results are ordered most-recent-first
|
|
72
|
+
"Results are ordered most-recent-first by default; use ascending=true for oldest-first."
|
|
69
73
|
),
|
|
70
74
|
"parameters": {
|
|
71
75
|
"type": "object",
|
|
@@ -75,6 +79,7 @@ _SEARCH_TOOL = {
|
|
|
75
79
|
"chat_query": {"type": "string", "description": "Name of the group chat to search within. Use the exact chat title if known. If multiple chats match, the search will return an ambiguity error asking you to clarify."},
|
|
76
80
|
"date_from": {"type": "string", "description": "Start of time range as ISO datetime (YYYY-MM-DDTHH:MM). For a full day, use T00:00."},
|
|
77
81
|
"date_to": {"type": "string", "description": "End of time range as ISO datetime (YYYY-MM-DDTHH:MM). For a full day, use T23:59."},
|
|
82
|
+
"ascending": {"type": "boolean", "description": "If true, results are ordered oldest-first. Use for queries like 'what was the first message about X?' or 'earliest mention of Y'."},
|
|
78
83
|
},
|
|
79
84
|
"required": [],
|
|
80
85
|
},
|
|
@@ -337,10 +342,13 @@ class TelegramBot:
|
|
|
337
342
|
# Wipe bot replies linked to this user, then user's own rows across all chats,
|
|
338
343
|
# then any remaining bot replies in the private chat (pre-migration rows).
|
|
339
344
|
await delete_bot_replies_for_user(user_id)
|
|
345
|
+
await delete_archive_for_user(user_id)
|
|
340
346
|
await delete_messages_for_user(user_id)
|
|
347
|
+
await delete_archive_for_chat(chat_id)
|
|
341
348
|
await delete_messages_for_chat(chat_id)
|
|
342
349
|
else:
|
|
343
350
|
await delete_bot_replies_for_user(user_id)
|
|
351
|
+
await delete_archive_for_user(user_id)
|
|
344
352
|
await delete_messages_for_user(user_id)
|
|
345
353
|
|
|
346
354
|
# Evict only the Conversations that contain this user's data, not all sessions.
|
|
@@ -827,6 +835,7 @@ class TelegramBot:
|
|
|
827
835
|
args.get('date_from'),
|
|
828
836
|
args.get('date_to'),
|
|
829
837
|
self.llm['search_limit'],
|
|
838
|
+
bool(args.get('ascending', False)),
|
|
830
839
|
)
|
|
831
840
|
if isinstance(results, list):
|
|
832
841
|
for r in results:
|
|
@@ -837,6 +846,13 @@ class TelegramBot:
|
|
|
837
846
|
r['timestamp'] = format_dt(dt)
|
|
838
847
|
except ValueError:
|
|
839
848
|
pass
|
|
849
|
+
# Lazy archival trigger: cap hit may indicate more archivable content.
|
|
850
|
+
if len(results) == self.llm['search_limit']:
|
|
851
|
+
try:
|
|
852
|
+
loop = asyncio.get_running_loop()
|
|
853
|
+
loop.create_task(run_archival(self.llm))
|
|
854
|
+
except RuntimeError:
|
|
855
|
+
pass
|
|
840
856
|
return json.dumps(results)
|
|
841
857
|
|
|
842
858
|
tool_def = self.webhook_defs.get(tool_call.name)
|
|
@@ -943,6 +959,7 @@ class TelegramBot:
|
|
|
943
959
|
token_limit = INIT_BOT_CONFIG['token_limit'],
|
|
944
960
|
search_limit = INIT_BOT_CONFIG['search_limit'],
|
|
945
961
|
persona_temp = INIT_BOT_CONFIG['persona_temp'],
|
|
962
|
+
archive_days = INIT_BOT_CONFIG['archive_days'],
|
|
946
963
|
persona_prompt = INIT_BOT_CONFIG['persona_prompt'],
|
|
947
964
|
key_status: ApiKeyStatus | None = None,
|
|
948
965
|
log_name: str = 'tellmgrambot',
|
|
@@ -964,6 +981,9 @@ class TelegramBot:
|
|
|
964
981
|
persona_temp: LLM temperature (0.0-2.0). If None, defaults to 1.0.
|
|
965
982
|
persona_prompt: System prompt defining the bot's behavior and personality.
|
|
966
983
|
key_status: ApiKeyStatus object indicating available features. If None, calls init_structure().
|
|
984
|
+
archive_days: Days before messages are eligible for Tier 1 archival (default: 60).
|
|
985
|
+
Must be an integer >= 1; invalid values log a warning and fall back to 60.
|
|
986
|
+
Tier 2 compression triggers at archive_days * 2.
|
|
967
987
|
webhook_schemas: Provider-compatible tool schema dicts for webhook tools (from build_tool_registry).
|
|
968
988
|
If None, no webhook tools are registered.
|
|
969
989
|
webhook_defs: Resolved webhook tool definitions keyed by tool name (from build_tool_registry).
|
|
@@ -1033,17 +1053,21 @@ class TelegramBot:
|
|
|
1033
1053
|
if persona_temp is not None and not (isinstance(persona_temp, (int, float)) and 0.0 <= persona_temp <= 2.0):
|
|
1034
1054
|
logger.warning(f"Invalid persona_temp '{persona_temp}' (must be a decimal between 0.0 and 2.0), using default 1.0")
|
|
1035
1055
|
persona_temp = None
|
|
1056
|
+
if archive_days is not None and not (isinstance(archive_days, int) and archive_days >= 1):
|
|
1057
|
+
logger.warning(f"Invalid archive_days '{archive_days}' (must be an integer >= 1), using default 60")
|
|
1058
|
+
archive_days = None
|
|
1036
1059
|
|
|
1037
1060
|
# Get our LLM spun up with defaults if not defined by user input
|
|
1038
1061
|
# Tokens as integers measure the length of conversation messages
|
|
1039
1062
|
self.llm = {
|
|
1040
|
-
'prompt'
|
|
1041
|
-
'chat_model'
|
|
1042
|
-
'url_model'
|
|
1043
|
-
'token_limit'
|
|
1044
|
-
'search_limit': search_limit or 30,
|
|
1045
|
-
'temperature'
|
|
1046
|
-
'top_p'
|
|
1063
|
+
'prompt' : persona_prompt,
|
|
1064
|
+
'chat_model' : chat_model,
|
|
1065
|
+
'url_model' : url_model,
|
|
1066
|
+
'token_limit' : token_limit or TokenLimits(chat_model).max_tokens(),
|
|
1067
|
+
'search_limit' : search_limit or 30,
|
|
1068
|
+
'temperature' : persona_temp or 1.0,
|
|
1069
|
+
'top_p' : 0.9,
|
|
1070
|
+
'archive_days' : archive_days if archive_days is not None else 60,
|
|
1047
1071
|
}
|
|
1048
1072
|
# Set a rounded-down integer to prune a lengthy conversation by 500 tokens
|
|
1049
1073
|
# Note if the upper limit is below 500, the lower limit is set to 0
|
|
@@ -1117,6 +1141,7 @@ class TelegramBot:
|
|
|
1117
1141
|
token_limit = config['token_limit'],
|
|
1118
1142
|
search_limit = config['search_limit'],
|
|
1119
1143
|
persona_temp = config['persona_temp'],
|
|
1144
|
+
archive_days = config['archive_days'],
|
|
1120
1145
|
persona_prompt = prompt,
|
|
1121
1146
|
key_status = key_status,
|
|
1122
1147
|
log_name = log_name,
|
|
@@ -0,0 +1,321 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Two-tier conversation archive for TeLLMgramBot.
|
|
3
|
+
|
|
4
|
+
Tier 1: Key fact extraction - batches of old messages distilled into concise statements,
|
|
5
|
+
grouped by chat + day. Private chats produce single-user rows; group chats produce
|
|
6
|
+
multi-speaker rows with a participants JSON array of contributing user_ids.
|
|
7
|
+
|
|
8
|
+
Tier 2: Episodic summarization - old Tier 1 rows compressed into thematic digests,
|
|
9
|
+
grouped by chat + month.
|
|
10
|
+
|
|
11
|
+
Raw messages are never deleted; archived_at flags rows to skip during context loading.
|
|
12
|
+
Search still hits raw rows regardless of archived_at.
|
|
13
|
+
"""
|
|
14
|
+
import json
|
|
15
|
+
import logging
|
|
16
|
+
|
|
17
|
+
import aiosqlite
|
|
18
|
+
|
|
19
|
+
from .database import get_db_path
|
|
20
|
+
from .providers.factory import get_provider
|
|
21
|
+
from .utils import cutoff_iso, now_iso
|
|
22
|
+
|
|
23
|
+
logger = logging.getLogger(__name__)
|
|
24
|
+
|
|
25
|
+
_archival_running = False
|
|
26
|
+
|
|
27
|
+
_TIER1_PROMPT = (
|
|
28
|
+
"Extract key facts from this conversation. "
|
|
29
|
+
"Ignore greetings, acknowledgments, and filler. "
|
|
30
|
+
"Return one concise factual statement per line, no numbering. "
|
|
31
|
+
"Only include meaningful, specific information. "
|
|
32
|
+
"Keep each statement under 20 words.\n\n"
|
|
33
|
+
"Conversation:\n{conversation}"
|
|
34
|
+
)
|
|
35
|
+
|
|
36
|
+
_TIER2_PROMPT = (
|
|
37
|
+
"Summarize these key facts into a concise thematic digest. "
|
|
38
|
+
"Group related facts together into 2-5 sentences. "
|
|
39
|
+
"Be specific, not generic. Do not use bullet points.\n\n"
|
|
40
|
+
"Key facts:\n{facts}"
|
|
41
|
+
)
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
async def run_archival(config: dict) -> None:
    """
    Execute the Tier 1 pass followed by the Tier 2 pass.

    Returns immediately if another run is still in flight (tracked by the
    module-level ``_archival_running`` flag). Any exception raised by either
    pass is logged with its traceback and swallowed, so callers that schedule
    this as a background task never see it fail.

    Args:
        config: Dict with keys: chat_model, and optionally archive_days.
    """
    global _archival_running

    if not _archival_running:
        _archival_running = True
        try:
            # Tier 1 must run first: Tier 2 consumes the rows Tier 1 produces.
            for tier_pass in (_run_tier1, _run_tier2):
                await tier_pass(config)
        except Exception as exc:
            logger.error(f"Archival run failed: {exc}", exc_info=True)
        finally:
            # Always release the guard, even when a pass blew up.
            _archival_running = False
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
def _get_model(config: dict) -> str:
|
|
67
|
+
"""Return the chat_model from config, or empty string if not set."""
|
|
68
|
+
return config.get('chat_model', '')
|
|
69
|
+
|
|
70
|
+
|
|
71
|
+
def _get_archive_days(config: dict) -> int:
|
|
72
|
+
"""Return validated archive_days from config, falling back to 60 on invalid values."""
|
|
73
|
+
_ad = config.get('archive_days')
|
|
74
|
+
if _ad is not None:
|
|
75
|
+
try:
|
|
76
|
+
days = int(_ad)
|
|
77
|
+
if days >= 1:
|
|
78
|
+
return days
|
|
79
|
+
except (TypeError, ValueError):
|
|
80
|
+
pass
|
|
81
|
+
logger.warning(f"ARCHIVE: invalid archive_days '{_ad}', using default 60")
|
|
82
|
+
return 60
|
|
83
|
+
|
|
84
|
+
|
|
85
|
+
def _fmt_ts(ts: str) -> str:
|
|
86
|
+
if not ts:
|
|
87
|
+
return ''
|
|
88
|
+
return ts[:16].replace('T', ' ') + ' UTC'
|
|
89
|
+
|
|
90
|
+
|
|
91
|
+
async def _run_tier1(config: dict) -> None:
    """
    Tier 1 pass: distil old raw messages into per-chat, per-day fact rows.

    Selects messages that have no archived_at, have is_private = 0 and were
    created before cutoff_iso(archive_days), buckets them by
    (chat_id, first 10 characters of created_at) and hands each bucket to
    _process_tier1_batch. A bucket that raises is logged as a warning and
    skipped; the remaining buckets are still processed.

    Args:
        config: Dict with keys: chat_model, and optionally archive_days.
    """
    chat_model = _get_model(config)
    if not chat_model:
        logger.warning("ARCHIVE: no model configured, skipping Tier 1")
        return

    threshold_days = _get_archive_days(config)

    async with aiosqlite.connect(get_db_path()) as db:
        # Ordering matches the bucketing below, so each bucket stays chronological.
        cursor = await db.execute(
            """
            SELECT m.id, m.chat_id, m.user_id, m.role, m.content, m.created_at,
                   u.first_name, u.username
            FROM messages m
            LEFT JOIN users u ON m.user_id = u.user_id
            WHERE m.archived_at IS NULL AND m.is_private = 0
              AND m.created_at < ?
            ORDER BY m.chat_id, date(m.created_at), m.created_at ASC, m.id ASC
            """,
            (cutoff_iso(threshold_days),),
        )
        raw_rows = await cursor.fetchall()

    if not raw_rows:
        return

    # Bucket key: (chat_id, YYYY-MM-DD)
    grouped: dict[tuple, list] = {}
    for record in raw_rows:
        bucket = (record[1], record[5][:10])
        if bucket not in grouped:
            grouped[bucket] = []
        grouped[bucket].append(record)

    provider = get_provider(chat_model)
    for (chat_id, day), day_rows in grouped.items():
        try:
            await _process_tier1_batch(provider, chat_model, chat_id, day, day_rows)
        except Exception as exc:
            # One bad day must not block archival of the others.
            logger.warning(f"ARCHIVE: Tier 1 batch {chat_id}/{day} failed: {exc}")
|
|
139
|
+
|
|
140
|
+
|
|
141
|
+
async def _process_tier1_batch(provider, model: str, chat_id: int, day: str, rows: list) -> None:
|
|
142
|
+
"""
|
|
143
|
+
Extract key facts from a single day's batch of messages via LLM.
|
|
144
|
+
|
|
145
|
+
Formats messages with speaker/timestamp annotations, calls the provider with
|
|
146
|
+
_TIER1_PROMPT, inserts the result as a summary_archive row (tier 1), and marks
|
|
147
|
+
source rows with archived_at. For private chats (chat_id > 0), stores as a
|
|
148
|
+
single-user row; for groups, extracts participant user_ids and stores as
|
|
149
|
+
multi-speaker row with participants JSON array.
|
|
150
|
+
|
|
151
|
+
Args:
|
|
152
|
+
provider: LLM provider instance (e.g., from get_provider).
|
|
153
|
+
model: Model name to pass to provider.complete().
|
|
154
|
+
chat_id: Telegram chat ID.
|
|
155
|
+
day: YYYY-MM-DD date string (used for logging).
|
|
156
|
+
rows: List of message tuples from the database query.
|
|
157
|
+
"""
|
|
158
|
+
# rows: (id, chat_id, user_id, role, content, created_at, first_name, username)
|
|
159
|
+
lines = []
|
|
160
|
+
for row in rows:
|
|
161
|
+
if row[3] == 'assistant':
|
|
162
|
+
speaker = 'Assistant'
|
|
163
|
+
else:
|
|
164
|
+
first_name, username = row[6], row[7]
|
|
165
|
+
speaker = first_name or (f"@{username}" if username else f"User {row[2]}")
|
|
166
|
+
lines.append(f"[{speaker}, {_fmt_ts(row[5])}]: {row[4]}")
|
|
167
|
+
|
|
168
|
+
conversation = '\n'.join(lines)
|
|
169
|
+
messages = [{"role": "user", "content": _TIER1_PROMPT.format(conversation=conversation)}]
|
|
170
|
+
|
|
171
|
+
result = await provider.complete(model, messages)
|
|
172
|
+
if not isinstance(result, str) or not result.strip():
|
|
173
|
+
return
|
|
174
|
+
|
|
175
|
+
user_ids = sorted({row[2] for row in rows if row[3] == 'user'})
|
|
176
|
+
is_private = chat_id > 0
|
|
177
|
+
if is_private and len(user_ids) == 1:
|
|
178
|
+
archive_user_id = user_ids[0]
|
|
179
|
+
participants_json = None
|
|
180
|
+
else:
|
|
181
|
+
archive_user_id = None
|
|
182
|
+
participants_json = json.dumps(user_ids) if user_ids else None
|
|
183
|
+
|
|
184
|
+
covers_from = rows[0][5]
|
|
185
|
+
covers_to = rows[-1][5]
|
|
186
|
+
msg_ids = [row[0] for row in rows]
|
|
187
|
+
|
|
188
|
+
async with aiosqlite.connect(get_db_path()) as db:
|
|
189
|
+
await db.execute(
|
|
190
|
+
"INSERT INTO summary_archive "
|
|
191
|
+
"(chat_id, user_id, participants, tier, content, covers_from, covers_to) "
|
|
192
|
+
"VALUES (?, ?, ?, 1, ?, ?, ?)",
|
|
193
|
+
(chat_id, archive_user_id, participants_json, result.strip(), covers_from, covers_to),
|
|
194
|
+
)
|
|
195
|
+
placeholders = ','.join('?' * len(msg_ids))
|
|
196
|
+
await db.execute(
|
|
197
|
+
f"UPDATE messages SET archived_at = ? WHERE id IN ({placeholders})",
|
|
198
|
+
[now_iso()] + msg_ids,
|
|
199
|
+
)
|
|
200
|
+
await db.commit()
|
|
201
|
+
|
|
202
|
+
logger.info(
|
|
203
|
+
f"ARCHIVE: Tier 1 stored for chat {chat_id} day {day} "
|
|
204
|
+
f"({len(rows)} messages -> {len(result.splitlines())} facts)"
|
|
205
|
+
)
|
|
206
|
+
|
|
207
|
+
|
|
208
|
+
async def _run_tier2(config: dict) -> None:
    """
    Tier 2 pass: compress old Tier 1 rows into per-chat, per-month digests.

    Selects summary_archive rows with tier = 1, no archived_at and covers_to
    before cutoff_iso(archive_days * 2), buckets them by
    (chat_id, first 7 characters of covers_from) and hands each bucket to
    _process_tier2_batch. A bucket that raises is logged as a warning and
    skipped; the remaining buckets are still processed. Returns silently
    when no model is configured.

    Args:
        config: Dict with keys: chat_model, and optionally archive_days.
    """
    chat_model = _get_model(config)
    if not chat_model:
        return

    # Tier 2 kicks in at twice the Tier 1 threshold.
    episode_days = _get_archive_days(config) * 2

    async with aiosqlite.connect(get_db_path()) as db:
        cursor = await db.execute(
            """
            SELECT id, chat_id, user_id, participants, content, covers_from, covers_to
            FROM summary_archive
            WHERE tier = 1 AND archived_at IS NULL
              AND covers_to < ?
            ORDER BY chat_id, strftime('%Y-%m', covers_from), covers_from ASC
            """,
            (cutoff_iso(episode_days),),
        )
        tier1_rows = await cursor.fetchall()

    if not tier1_rows:
        return

    # Bucket key: (chat_id, YYYY-MM)
    grouped: dict[tuple, list] = {}
    for record in tier1_rows:
        bucket = (record[1], record[5][:7])
        if bucket not in grouped:
            grouped[bucket] = []
        grouped[bucket].append(record)

    provider = get_provider(chat_model)
    for (chat_id, month), month_rows in grouped.items():
        try:
            await _process_tier2_batch(provider, chat_model, chat_id, month, month_rows)
        except Exception as exc:
            # One bad month must not block compression of the others.
            logger.warning(f"ARCHIVE: Tier 2 batch {chat_id}/{month} failed: {exc}")
|
|
254
|
+
|
|
255
|
+
|
|
256
|
+
async def _process_tier2_batch(provider, model: str, chat_id: int, month: str, rows: list) -> None:
    """
    Fold one month of Tier 1 rows for a chat into a single Tier 2 digest.

    Joins the Tier 1 contents with newlines, sends them to the provider with
    _TIER2_PROMPT, inserts the reply as a summary_archive row (tier 2) and
    stamps the source Tier 1 rows with archived_at in the same transaction.
    If the provider returns a non-string or blank reply, nothing is written.

    Attribution: when every source row that names a user names the same one
    and no source row carries a participants value, the digest is attributed
    to that user. Otherwise user_id is NULL and participants holds the sorted
    union of all user_ids seen in either column.

    Args:
        provider: LLM provider instance (e.g., from get_provider).
        model: Model name to pass to provider.complete().
        chat_id: Telegram chat ID.
        month: YYYY-MM month string (used for logging).
        rows: List of Tier 1 archive tuples from the database query.
    """
    # rows: (id, chat_id, user_id, participants, content, covers_from, covers_to)
    facts = '\n'.join(record[4] for record in rows)
    prompt = _TIER2_PROMPT.format(facts=facts)

    result = await provider.complete(model, [{"role": "user", "content": prompt}])
    if not (isinstance(result, str) and result.strip()):
        return

    attributed: set[int] = set()   # user_ids taken from the user_id column only
    everyone: set[int] = set()     # user_id column plus decoded participants
    multi_speaker = False
    for record in rows:
        owner_id, participants_raw = record[2], record[3]
        if owner_id is not None:
            attributed.add(owner_id)
            everyone.add(owner_id)
        if participants_raw:
            multi_speaker = True
            try:
                everyone.update(json.loads(participants_raw))
            except (json.JSONDecodeError, TypeError):
                # Best effort: an unreadable participants value is ignored.
                pass

    if multi_speaker or len(attributed) != 1:
        archive_user_id = None
        participants_json = json.dumps(sorted(everyone)) if everyone else None
    else:
        (archive_user_id,) = attributed
        participants_json = None

    first_row, last_row = rows[0], rows[-1]
    covers_from = first_row[5]
    covers_to = last_row[6]
    source_ids = [record[0] for record in rows]

    async with aiosqlite.connect(get_db_path()) as db:
        await db.execute(
            "INSERT INTO summary_archive "
            "(chat_id, user_id, participants, tier, content, covers_from, covers_to) "
            "VALUES (?, ?, ?, 2, ?, ?, ?)",
            (chat_id, archive_user_id, participants_json, result.strip(), covers_from, covers_to),
        )
        placeholders = ','.join('?' * len(source_ids))
        await db.execute(
            f"UPDATE summary_archive SET archived_at = ? WHERE id IN ({placeholders})",
            [now_iso()] + source_ids,
        )
        await db.commit()

    logger.info(
        f"ARCHIVE: Tier 2 stored for chat {chat_id} month {month} "
        f"({len(rows)} Tier 1 rows -> 1 episode)"
    )
|