TeLLMgramBot 3.12.0__tar.gz → 3.13.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (26) hide show
  1. {tellmgrambot-3.12.0 → tellmgrambot-3.13.0}/PKG-INFO +5 -1
  2. {tellmgrambot-3.12.0 → tellmgrambot-3.13.0}/README.md +4 -0
  3. {tellmgrambot-3.12.0 → tellmgrambot-3.13.0}/TeLLMgramBot/TeLLMgramBot.py +33 -8
  4. tellmgrambot-3.13.0/TeLLMgramBot/archive.py +321 -0
  5. {tellmgrambot-3.12.0 → tellmgrambot-3.13.0}/TeLLMgramBot/database.py +349 -288
  6. {tellmgrambot-3.12.0 → tellmgrambot-3.13.0}/TeLLMgramBot/initialize.py +23 -8
  7. {tellmgrambot-3.12.0 → tellmgrambot-3.13.0}/TeLLMgramBot/utils.py +11 -0
  8. {tellmgrambot-3.12.0 → tellmgrambot-3.13.0}/TeLLMgramBot.egg-info/PKG-INFO +5 -1
  9. {tellmgrambot-3.12.0 → tellmgrambot-3.13.0}/TeLLMgramBot.egg-info/SOURCES.txt +1 -0
  10. {tellmgrambot-3.12.0 → tellmgrambot-3.13.0}/setup.py +1 -1
  11. {tellmgrambot-3.12.0 → tellmgrambot-3.13.0}/LICENSE +0 -0
  12. {tellmgrambot-3.12.0 → tellmgrambot-3.13.0}/TeLLMgramBot/__init__.py +0 -0
  13. {tellmgrambot-3.12.0 → tellmgrambot-3.13.0}/TeLLMgramBot/conversation.py +0 -0
  14. {tellmgrambot-3.12.0 → tellmgrambot-3.13.0}/TeLLMgramBot/message_handlers.py +0 -0
  15. {tellmgrambot-3.12.0 → tellmgrambot-3.13.0}/TeLLMgramBot/models.py +0 -0
  16. {tellmgrambot-3.12.0 → tellmgrambot-3.13.0}/TeLLMgramBot/providers/__init__.py +0 -0
  17. {tellmgrambot-3.12.0 → tellmgrambot-3.13.0}/TeLLMgramBot/providers/anthropic_provider.py +0 -0
  18. {tellmgrambot-3.12.0 → tellmgrambot-3.13.0}/TeLLMgramBot/providers/base.py +0 -0
  19. {tellmgrambot-3.12.0 → tellmgrambot-3.13.0}/TeLLMgramBot/providers/factory.py +0 -0
  20. {tellmgrambot-3.12.0 → tellmgrambot-3.13.0}/TeLLMgramBot/providers/openai_provider.py +0 -0
  21. {tellmgrambot-3.12.0 → tellmgrambot-3.13.0}/TeLLMgramBot/tools.py +0 -0
  22. {tellmgrambot-3.12.0 → tellmgrambot-3.13.0}/TeLLMgramBot/web_utils.py +0 -0
  23. {tellmgrambot-3.12.0 → tellmgrambot-3.13.0}/TeLLMgramBot.egg-info/dependency_links.txt +0 -0
  24. {tellmgrambot-3.12.0 → tellmgrambot-3.13.0}/TeLLMgramBot.egg-info/requires.txt +0 -0
  25. {tellmgrambot-3.12.0 → tellmgrambot-3.13.0}/TeLLMgramBot.egg-info/top_level.txt +0 -0
  26. {tellmgrambot-3.12.0 → tellmgrambot-3.13.0}/setup.cfg +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: TeLLMgramBot
3
- Version: 3.12.0
3
+ Version: 3.13.0
4
4
  Summary: LLM-powered Telegram bot (OpenAI + Anthropic)
5
5
  Home-page: https://github.com/Digital-Heresy/TeLLMgramBot
6
6
  Author: Digital Heresy
@@ -48,6 +48,9 @@ The basic goal of this project is to create a bridge between a Telegram Bot and
48
48
  * Token limits measure conversation length and determine when to prune oldest messages to stay within model limits.
49
49
  * The bot loads the user's full history across all chats up to 50% of the token budget. In private chats, shared group context fills the remaining budget, enabling the bot to reference group conversations from a private context.
50
50
  * This eliminates amnesia when switching between private and group chats.
51
+ * Conversation archive preserves long-term context without consuming token budget.
52
+ * Older messages are automatically distilled into concise daily summaries (Tier 1), then progressively compressed into monthly digests (Tier 2). Raw messages are never deleted; archive rows surface seamlessly in search results and context loading.
53
+ * Configurable via `archive_days` (default 60 days before Tier 1 triggers; Tier 2 triggers at 2x this value).
51
54
  * Users can manage privacy via two commands:
52
55
  * `/forget` - In private chats, clears your full conversation and resets all active sessions. In group chats, removes only your messages and cleans up paired bot replies.
53
56
  * `/private` - Toggle private mode (private chats only). When ON, your messages in private chats are excluded from group conversation contexts, enabling selective privacy even in shared groups.
@@ -152,6 +155,7 @@ When the bot is triggered in a group and about to respond (not deferring to anot
152
155
  - `db_name`: Optional custom database filename without extension (e.g. `MyBot` creates `MyBot.db`); omit for default `conversations.db`. Use distinct names when running multiple bot instances in the same directory.
153
156
  - `token_limit`: Max tokens (optional; defaults to model's maximum)
154
157
  - `search_limit`: Max search results (optional; defaults to 30)
158
+ - `archive_days`: Days before messages are eligible for archival (optional; default 60, minimum 1). Older messages are distilled into daily summaries, then progressively compressed into monthly digests. Once archived, raw messages are no longer loaded into the LLM context; they remain reachable only through message search.
155
159
  - `tools`: Optional list of webhook and MCP tool definitions (admin-only, private chat only). See [docs/tools.md](docs/tools.md) for schema and examples.
156
160
  4. **Disable group privacy mode in BotFather:**
157
161
  ```
@@ -16,6 +16,9 @@ The basic goal of this project is to create a bridge between a Telegram Bot and
16
16
  * Token limits measure conversation length and determine when to prune oldest messages to stay within model limits.
17
17
  * The bot loads the user's full history across all chats up to 50% of the token budget. In private chats, shared group context fills the remaining budget, enabling the bot to reference group conversations from a private context.
18
18
  * This eliminates amnesia when switching between private and group chats.
19
+ * Conversation archive preserves long-term context without consuming token budget.
20
+ * Older messages are automatically distilled into concise daily summaries (Tier 1), then progressively compressed into monthly digests (Tier 2). Raw messages are never deleted; archive rows surface seamlessly in search results and context loading.
21
+ * Configurable via `archive_days` (default 60 days before Tier 1 triggers; Tier 2 triggers at 2x this value).
19
22
  * Users can manage privacy via two commands:
20
23
  * `/forget` - In private chats, clears your full conversation and resets all active sessions. In group chats, removes only your messages and cleans up paired bot replies.
21
24
  * `/private` - Toggle private mode (private chats only). When ON, your messages in private chats are excluded from group conversation contexts, enabling selective privacy even in shared groups.
@@ -120,6 +123,7 @@ When the bot is triggered in a group and about to respond (not deferring to anot
120
123
  - `db_name`: Optional custom database filename without extension (e.g. `MyBot` creates `MyBot.db`); omit for default `conversations.db`. Use distinct names when running multiple bot instances in the same directory.
121
124
  - `token_limit`: Max tokens (optional; defaults to model's maximum)
122
125
  - `search_limit`: Max search results (optional; defaults to 30)
126
+ - `archive_days`: Days before messages are eligible for archival (optional; default 60, minimum 1). Older messages are distilled into daily summaries, then progressively compressed into monthly digests. Once archived, raw messages are no longer loaded into the LLM context; they remain reachable only through message search.
123
127
  - `tools`: Optional list of webhook and MCP tool definitions (admin-only, private chat only). See [docs/tools.md](docs/tools.md) for schema and examples.
124
128
  4. **Disable group privacy mode in BotFather:**
125
129
  ```
@@ -22,6 +22,8 @@ from .database import (
22
22
  delete_messages_for_chat,
23
23
  delete_private_messages_for_user,
24
24
  delete_bot_replies_for_user,
25
+ delete_archive_for_user,
26
+ delete_archive_for_chat,
25
27
  get_shared_group_chat_ids,
26
28
  message_id_exists,
27
29
  update_message_tg_id,
@@ -30,6 +32,7 @@ from .database import (
30
32
  upsert_user,
31
33
  wipe_all_data,
32
34
  )
35
+ from .archive import run_archival
33
36
  from .initialize import (
34
37
  INIT_BOT_CONFIG,
35
38
  ApiKeyStatus,
@@ -61,11 +64,12 @@ _SEARCH_TOOL = {
61
64
  "name": "search_messages",
62
65
  "description": (
63
66
  "Search the full message history across the user's private chat and shared group chats. "
67
+ "Results include both raw messages and archived summaries of older content. "
64
68
  "Use whenever the user asks who said something, what someone said, or what was discussed. "
65
69
  "Always search before claiming a person has no message history -- do not assume from context alone. "
66
70
  "Run the search immediately when it would help answer the question -- do not ask the user for permission to search. "
67
71
  "All filters are optional -- omit them to retrieve recent messages broadly. "
68
- "Results are ordered most-recent-first; to find the earliest message, look at the last result."
72
+ "Results are ordered most-recent-first by default; use ascending=true for oldest-first."
69
73
  ),
70
74
  "parameters": {
71
75
  "type": "object",
@@ -75,6 +79,7 @@ _SEARCH_TOOL = {
75
79
  "chat_query": {"type": "string", "description": "Name of the group chat to search within. Use the exact chat title if known. If multiple chats match, the search will return an ambiguity error asking you to clarify."},
76
80
  "date_from": {"type": "string", "description": "Start of time range as ISO datetime (YYYY-MM-DDTHH:MM). For a full day, use T00:00."},
77
81
  "date_to": {"type": "string", "description": "End of time range as ISO datetime (YYYY-MM-DDTHH:MM). For a full day, use T23:59."},
82
+ "ascending": {"type": "boolean", "description": "If true, results are ordered oldest-first. Use for queries like 'what was the first message about X?' or 'earliest mention of Y'."},
78
83
  },
79
84
  "required": [],
80
85
  },
@@ -337,10 +342,13 @@ class TelegramBot:
337
342
  # Wipe bot replies linked to this user, then user's own rows across all chats,
338
343
  # then any remaining bot replies in the private chat (pre-migration rows).
339
344
  await delete_bot_replies_for_user(user_id)
345
+ await delete_archive_for_user(user_id)
340
346
  await delete_messages_for_user(user_id)
347
+ await delete_archive_for_chat(chat_id)
341
348
  await delete_messages_for_chat(chat_id)
342
349
  else:
343
350
  await delete_bot_replies_for_user(user_id)
351
+ await delete_archive_for_user(user_id)
344
352
  await delete_messages_for_user(user_id)
345
353
 
346
354
  # Evict only the Conversations that contain this user's data, not all sessions.
@@ -827,6 +835,7 @@ class TelegramBot:
827
835
  args.get('date_from'),
828
836
  args.get('date_to'),
829
837
  self.llm['search_limit'],
838
+ bool(args.get('ascending', False)),
830
839
  )
831
840
  if isinstance(results, list):
832
841
  for r in results:
@@ -837,6 +846,13 @@ class TelegramBot:
837
846
  r['timestamp'] = format_dt(dt)
838
847
  except ValueError:
839
848
  pass
849
+ # Lazy archival trigger: cap hit may indicate more archivable content.
850
+ if len(results) == self.llm['search_limit']:
851
+ try:
852
+ loop = asyncio.get_running_loop()
853
+ loop.create_task(run_archival(self.llm))
854
+ except RuntimeError:
855
+ pass
840
856
  return json.dumps(results)
841
857
 
842
858
  tool_def = self.webhook_defs.get(tool_call.name)
@@ -943,6 +959,7 @@ class TelegramBot:
943
959
  token_limit = INIT_BOT_CONFIG['token_limit'],
944
960
  search_limit = INIT_BOT_CONFIG['search_limit'],
945
961
  persona_temp = INIT_BOT_CONFIG['persona_temp'],
962
+ archive_days = INIT_BOT_CONFIG['archive_days'],
946
963
  persona_prompt = INIT_BOT_CONFIG['persona_prompt'],
947
964
  key_status: ApiKeyStatus | None = None,
948
965
  log_name: str = 'tellmgrambot',
@@ -964,6 +981,9 @@ class TelegramBot:
964
981
  persona_temp: LLM temperature (0.0-2.0). If None, defaults to 1.0.
965
982
  persona_prompt: System prompt defining the bot's behavior and personality.
966
983
  key_status: ApiKeyStatus object indicating available features. If None, calls init_structure().
984
+ archive_days: Days before messages are eligible for Tier 1 archival (default: 60).
985
+ Must be an integer >= 1; invalid values log a warning and fall back to 60.
986
+ Tier 2 compression triggers at archive_days * 2.
967
987
  webhook_schemas: Provider-compatible tool schema dicts for webhook tools (from build_tool_registry).
968
988
  If None, no webhook tools are registered.
969
989
  webhook_defs: Resolved webhook tool definitions keyed by tool name (from build_tool_registry).
@@ -1033,17 +1053,21 @@ class TelegramBot:
1033
1053
  if persona_temp is not None and not (isinstance(persona_temp, (int, float)) and 0.0 <= persona_temp <= 2.0):
1034
1054
  logger.warning(f"Invalid persona_temp '{persona_temp}' (must be a decimal between 0.0 and 2.0), using default 1.0")
1035
1055
  persona_temp = None
1056
+ if archive_days is not None and not (isinstance(archive_days, int) and archive_days >= 1):
1057
+ logger.warning(f"Invalid archive_days '{archive_days}' (must be an integer >= 1), using default 60")
1058
+ archive_days = None
1036
1059
 
1037
1060
  # Get our LLM spun up with defaults if not defined by user input
1038
1061
  # Tokens as integers measure the length of conversation messages
1039
1062
  self.llm = {
1040
- 'prompt' : persona_prompt,
1041
- 'chat_model' : chat_model,
1042
- 'url_model' : url_model,
1043
- 'token_limit' : token_limit or TokenLimits(chat_model).max_tokens(),
1044
- 'search_limit': search_limit or 30,
1045
- 'temperature' : persona_temp or 1.0,
1046
- 'top_p' : 0.9
1063
+ 'prompt' : persona_prompt,
1064
+ 'chat_model' : chat_model,
1065
+ 'url_model' : url_model,
1066
+ 'token_limit' : token_limit or TokenLimits(chat_model).max_tokens(),
1067
+ 'search_limit' : search_limit or 30,
1068
+ 'temperature' : persona_temp or 1.0,
1069
+ 'top_p' : 0.9,
1070
+ 'archive_days' : archive_days if archive_days is not None else 60,
1047
1071
  }
1048
1072
  # Set a rounded-down integer to prune a lengthy conversation by 500 tokens
1049
1073
  # Note if the upper limit is below 500, the lower limit is set to 0
@@ -1117,6 +1141,7 @@ class TelegramBot:
1117
1141
  token_limit = config['token_limit'],
1118
1142
  search_limit = config['search_limit'],
1119
1143
  persona_temp = config['persona_temp'],
1144
+ archive_days = config['archive_days'],
1120
1145
  persona_prompt = prompt,
1121
1146
  key_status = key_status,
1122
1147
  log_name = log_name,
@@ -0,0 +1,321 @@
1
+ """
2
+ Two-tier conversation archive for TeLLMgramBot.
3
+
4
+ Tier 1: Key fact extraction - batches of old messages distilled into concise statements,
5
+ grouped by chat + day. Private chats produce single-user rows; group chats produce
6
+ multi-speaker rows with a participants JSON array of contributing user_ids.
7
+
8
+ Tier 2: Episodic summarization - old Tier 1 rows compressed into thematic digests,
9
+ grouped by chat + month.
10
+
11
+ Raw messages are never deleted; archived_at flags rows to skip during context loading.
12
+ Search still hits raw rows regardless of archived_at.
13
+ """
14
+ import json
15
+ import logging
16
+
17
+ import aiosqlite
18
+
19
+ from .database import get_db_path
20
+ from .providers.factory import get_provider
21
+ from .utils import cutoff_iso, now_iso
22
+
23
+ logger = logging.getLogger(__name__)
24
+
25
+ _archival_running = False
26
+
27
# Tier 1 prompt: instructs the model to reduce a raw transcript to one short
# factual statement per line. The transcript is substituted for {conversation}.
_TIER1_PROMPT = " ".join((
    "Extract key facts from this conversation.",
    "Ignore greetings, acknowledgments, and filler.",
    "Return one concise factual statement per line, no numbering.",
    "Only include meaningful, specific information.",
    "Keep each statement under 20 words.",
)) + "\n\n" + "Conversation:\n{conversation}"

# Tier 2 prompt: instructs the model to fold Tier 1 facts into a short prose
# digest. The newline-joined facts are substituted for {facts}.
_TIER2_PROMPT = " ".join((
    "Summarize these key facts into a concise thematic digest.",
    "Group related facts together into 2-5 sentences.",
    "Be specific, not generic.",
    "Do not use bullet points.",
)) + "\n\n" + "Key facts:\n{facts}"
42
+
43
+
44
async def run_archival(config: dict) -> None:
    """
    Execute one archival cycle: the Tier 1 pass followed by the Tier 2 pass.

    A module-level flag collapses overlapping invocations: if a cycle is
    already in progress the call returns immediately without doing anything.
    An exception raised by either pass is logged with its traceback and
    swallowed; the flag is cleared again in every case.

    Args:
        config: Dict with keys: chat_model, and optionally archive_days.
    """
    global _archival_running
    if not _archival_running:
        _archival_running = True
        try:
            # Tier 2 consumes Tier 1 rows, so the order of the passes matters.
            for archive_pass in (_run_tier1, _run_tier2):
                await archive_pass(config)
        except Exception as exc:
            logger.error(f"Archival run failed: {exc}", exc_info=True)
        finally:
            _archival_running = False
64
+
65
+
66
def _get_model(config: dict) -> str:
    """
    Return the chat_model from config, or empty string if not set.

    "Not set" covers both a missing key and a key whose value is None (or any
    other falsy value), so the result can always be used in a plain
    truthiness check and never leaks None to callers expecting a string.

    Args:
        config: Dict that may contain a 'chat_model' entry.

    Returns:
        The configured model name, or '' when none is configured.
    """
    return config.get('chat_model') or ''
69
+
70
+
71
def _get_archive_days(config: dict) -> int:
    """
    Return validated archive_days from config, falling back to 60 on invalid values.

    archive_days is optional: when the key is missing or None the default is
    returned silently. A warning is logged only when a value was supplied but
    cannot be interpreted as an integer >= 1.

    Args:
        config: Dict that may contain an 'archive_days' entry.

    Returns:
        The configured number of days as an int, or 60 when absent or invalid.
    """
    default_days = 60
    raw = config.get('archive_days')
    if raw is None:
        # Absent is not invalid; do not spam the log on every archival pass.
        return default_days

    try:
        days = int(raw)
    except (TypeError, ValueError, OverflowError):
        # OverflowError covers int(float('inf')).
        days = 0

    if days >= 1:
        return days

    logger.warning(f"ARCHIVE: invalid archive_days '{raw}', using default 60")
    return default_days
83
+
84
+
85
def _fmt_ts(ts: str) -> str:
    """
    Shorten a stored timestamp string for use in a transcript line.

    Keeps the first 16 characters of ts, replaces any 'T' with a space and
    appends the literal ' UTC' label (no timezone conversion is performed).
    For an ISO value such as '2024-05-01T12:34:56' this gives
    '2024-05-01 12:34 UTC'. Falsy input yields ''.
    """
    if ts:
        to_the_minute = ts[:16]
        return f"{to_the_minute.replace('T', ' ')} UTC"
    return ''
89
+
90
+
91
async def _run_tier1(config: dict) -> None:
    """
    Tier 1 pass: turn not-yet-archived messages older than the cutoff into key-fact rows.

    Selects messages whose archived_at is NULL, whose is_private flag is 0 and
    whose created_at precedes cutoff_iso(archive_days), buckets them by
    (chat_id, calendar day of created_at) and hands each bucket to
    _process_tier1_batch. A failing bucket is logged as a warning and does not
    stop the remaining buckets. Skips entirely, with a warning, when no
    chat model is configured.

    Args:
        config: Dict with keys: chat_model, and optionally archive_days.
    """
    model = _get_model(config)
    if not model:
        logger.warning("ARCHIVE: no model configured, skipping Tier 1")
        return

    after_days = _get_archive_days(config)

    async with aiosqlite.connect(get_db_path()) as db:
        cutoff = cutoff_iso(after_days)
        cursor = await db.execute(
            """
            SELECT m.id, m.chat_id, m.user_id, m.role, m.content, m.created_at,
                   u.first_name, u.username
            FROM messages m
            LEFT JOIN users u ON m.user_id = u.user_id
            WHERE m.archived_at IS NULL AND m.is_private = 0
              AND m.created_at < ?
            ORDER BY m.chat_id, date(m.created_at), m.created_at ASC, m.id ASC
            """,
            (cutoff,),
        )
        pending = await cursor.fetchall()

    if not pending:
        return

    # Bucket by (chat_id, YYYY-MM-DD); the query order keeps each bucket chronological.
    by_chat_day: dict[tuple, list] = {}
    for record in pending:
        bucket_key = (record[1], record[5][:10])
        by_chat_day.setdefault(bucket_key, []).append(record)

    provider = get_provider(model)
    for (chat_id, day), day_rows in by_chat_day.items():
        try:
            await _process_tier1_batch(provider, model, chat_id, day, day_rows)
        except Exception as exc:
            logger.warning(f"ARCHIVE: Tier 1 batch {chat_id}/{day} failed: {exc}")
139
+
140
+
141
async def _process_tier1_batch(provider, model: str, chat_id: int, day: str, rows: list) -> None:
    """
    Distill one chat-day of raw messages into a single Tier 1 archive row.

    Each message is rendered as "[speaker, timestamp]: content" (assistant rows
    are labelled 'Assistant'; user rows fall back from first name to @username
    to "User <id>"). The transcript is sent to the provider with _TIER1_PROMPT
    and a non-blank string reply is stored in summary_archive with tier 1. The
    source messages are stamped with archived_at and committed together with
    the new archive row.

    Attribution: when chat_id is positive (treated as a private chat) and
    exactly one user wrote the user-role messages, the row is stored against
    that user_id with participants NULL. Otherwise user_id is NULL and
    participants holds a JSON array of the contributing user_ids (NULL when
    there are none).

    Nothing is written when rows is empty or the provider returns anything
    other than a non-blank string.

    Args:
        provider: LLM provider instance (e.g., from get_provider).
        model: Model name to pass to provider.complete().
        chat_id: Telegram chat ID.
        day: YYYY-MM-DD date string (used for logging).
        rows: List of message tuples
            (id, chat_id, user_id, role, content, created_at, first_name, username),
            expected in chronological order.
    """
    if not rows:
        # Nothing to summarize; also avoids indexing rows[0] below.
        return

    lines = []
    for _id, _chat, user_id, role, content, created_at, first_name, username in rows:
        if role == 'assistant':
            speaker = 'Assistant'
        else:
            speaker = first_name or (f"@{username}" if username else f"User {user_id}")
        lines.append(f"[{speaker}, {_fmt_ts(created_at)}]: {content}")

    conversation = '\n'.join(lines)
    messages = [{"role": "user", "content": _TIER1_PROMPT.format(conversation=conversation)}]

    result = await provider.complete(model, messages)
    if not isinstance(result, str) or not result.strip():
        return
    facts = result.strip()

    user_ids = sorted({row[2] for row in rows if row[3] == 'user'})
    is_private = chat_id > 0
    if is_private and len(user_ids) == 1:
        archive_user_id = user_ids[0]
        participants_json = None
    else:
        archive_user_id = None
        participants_json = json.dumps(user_ids) if user_ids else None

    covers_from = rows[0][5]
    covers_to = rows[-1][5]

    async with aiosqlite.connect(get_db_path()) as db:
        await db.execute(
            "INSERT INTO summary_archive "
            "(chat_id, user_id, participants, tier, content, covers_from, covers_to) "
            "VALUES (?, ?, ?, 1, ?, ?, ?)",
            (chat_id, archive_user_id, participants_json, facts, covers_from, covers_to),
        )
        # One bound pair per message instead of a single IN (...) list: a busy
        # chat-day could otherwise exceed SQLite's bound-variable limit, making
        # the batch fail after the LLM call and be retried on every run.
        archived_at = now_iso()
        await db.executemany(
            "UPDATE messages SET archived_at = ? WHERE id = ?",
            [(archived_at, row[0]) for row in rows],
        )
        await db.commit()

    logger.info(
        f"ARCHIVE: Tier 1 stored for chat {chat_id} day {day} "
        f"({len(rows)} messages -> {len(facts.splitlines())} facts)"
    )
206
+
207
+
208
async def _run_tier2(config: dict) -> None:
    """
    Tier 2 pass: fold aged Tier 1 rows into monthly episode summaries.

    Selects summary_archive rows with tier = 1, archived_at NULL and covers_to
    earlier than cutoff_iso(archive_days * 2), buckets them by
    (chat_id, YYYY-MM of covers_from) and hands each bucket to
    _process_tier2_batch. A failing bucket is logged as a warning and does not
    stop the remaining buckets. Returns silently when no chat model is
    configured.

    Args:
        config: Dict with keys: chat_model, and optionally archive_days.
    """
    model = _get_model(config)
    if not model:
        return

    episode_days = _get_archive_days(config) * 2

    async with aiosqlite.connect(get_db_path()) as db:
        cutoff = cutoff_iso(episode_days)
        cursor = await db.execute(
            """
            SELECT id, chat_id, user_id, participants, content, covers_from, covers_to
            FROM summary_archive
            WHERE tier = 1 AND archived_at IS NULL
              AND covers_to < ?
            ORDER BY chat_id, strftime('%Y-%m', covers_from), covers_from ASC
            """,
            (cutoff,),
        )
        candidates = await cursor.fetchall()

    if not candidates:
        return

    # Bucket by (chat_id, YYYY-MM); the query order keeps each bucket chronological.
    by_chat_month: dict[tuple, list] = {}
    for record in candidates:
        bucket_key = (record[1], record[5][:7])
        by_chat_month.setdefault(bucket_key, []).append(record)

    provider = get_provider(model)
    for (chat_id, month), month_rows in by_chat_month.items():
        try:
            await _process_tier2_batch(provider, model, chat_id, month, month_rows)
        except Exception as exc:
            logger.warning(f"ARCHIVE: Tier 2 batch {chat_id}/{month} failed: {exc}")
254
+
255
+
256
async def _process_tier2_batch(provider, model: str, chat_id: int, month: str, rows: list) -> None:
    """
    Fold one chat-month of Tier 1 rows into a single Tier 2 digest.

    The Tier 1 contents are joined with newlines and sent to the provider with
    _TIER2_PROMPT. A non-blank string reply is inserted into summary_archive
    with tier 2, covering rows[0].covers_from through rows[-1].covers_to, and
    the source Tier 1 rows are stamped with archived_at in the same commit.

    Attribution: if exactly one distinct user_id appears across the source rows
    and none of them carries a participants value, the digest is stored against
    that user_id. Otherwise user_id is NULL and participants is the sorted JSON
    array of every user_id and parsed participant found (NULL when there are
    none). Participants values that fail to parse are ignored.

    Args:
        provider: LLM provider instance (e.g., from get_provider).
        model: Model name to pass to provider.complete().
        chat_id: Telegram chat ID.
        month: YYYY-MM month string (used for logging).
        rows: List of Tier 1 archive tuples
            (id, chat_id, user_id, participants, content, covers_from, covers_to).
    """
    prompt = _TIER2_PROMPT.format(facts='\n'.join(entry[4] for entry in rows))

    result = await provider.complete(model, [{"role": "user", "content": prompt}])
    if not (isinstance(result, str) and result.strip()):
        return

    attributed: set[int] = set()   # user_ids taken from single-user source rows
    everyone: set[int] = set()     # attributed ids plus all parsed participants
    multi_speaker = False
    for entry in rows:
        owner, participants = entry[2], entry[3]
        if owner is not None:
            attributed.add(owner)
            everyone.add(owner)
        if participants:
            multi_speaker = True
            try:
                everyone.update(json.loads(participants))
            except (json.JSONDecodeError, TypeError):
                pass

    if len(attributed) == 1 and not multi_speaker:
        archive_user_id, participants_json = next(iter(attributed)), None
    else:
        archive_user_id = None
        participants_json = json.dumps(sorted(everyone)) if everyone else None

    first_row, last_row = rows[0], rows[-1]
    covers_from, covers_to = first_row[5], last_row[6]
    source_ids = [entry[0] for entry in rows]

    async with aiosqlite.connect(get_db_path()) as db:
        await db.execute(
            "INSERT INTO summary_archive "
            "(chat_id, user_id, participants, tier, content, covers_from, covers_to) "
            "VALUES (?, ?, ?, 2, ?, ?, ?)",
            (chat_id, archive_user_id, participants_json, result.strip(), covers_from, covers_to),
        )
        placeholders = ','.join('?' * len(source_ids))
        await db.execute(
            f"UPDATE summary_archive SET archived_at = ? WHERE id IN ({placeholders})",
            [now_iso(), *source_ids],
        )
        await db.commit()

    logger.info(
        f"ARCHIVE: Tier 2 stored for chat {chat_id} month {month} "
        f"({len(rows)} Tier 1 rows -> 1 episode)"
    )