TeLLMgramBot 3.15.0__tar.gz → 3.15.2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (26)
  1. {tellmgrambot-3.15.0 → tellmgrambot-3.15.2}/PKG-INFO +4 -2
  2. {tellmgrambot-3.15.0 → tellmgrambot-3.15.2}/README.md +1 -1
  3. {tellmgrambot-3.15.0 → tellmgrambot-3.15.2}/TeLLMgramBot/TeLLMgramBot.py +109 -55
  4. {tellmgrambot-3.15.0 → tellmgrambot-3.15.2}/TeLLMgramBot/archive.py +39 -8
  5. {tellmgrambot-3.15.0 → tellmgrambot-3.15.2}/TeLLMgramBot/initialize.py +9 -11
  6. {tellmgrambot-3.15.0 → tellmgrambot-3.15.2}/TeLLMgramBot/message_handlers.py +120 -13
  7. {tellmgrambot-3.15.0 → tellmgrambot-3.15.2}/TeLLMgramBot.egg-info/PKG-INFO +4 -2
  8. {tellmgrambot-3.15.0 → tellmgrambot-3.15.2}/TeLLMgramBot.egg-info/requires.txt +2 -0
  9. {tellmgrambot-3.15.0 → tellmgrambot-3.15.2}/setup.py +3 -1
  10. {tellmgrambot-3.15.0 → tellmgrambot-3.15.2}/LICENSE +0 -0
  11. {tellmgrambot-3.15.0 → tellmgrambot-3.15.2}/TeLLMgramBot/__init__.py +0 -0
  12. {tellmgrambot-3.15.0 → tellmgrambot-3.15.2}/TeLLMgramBot/conversation.py +0 -0
  13. {tellmgrambot-3.15.0 → tellmgrambot-3.15.2}/TeLLMgramBot/database.py +0 -0
  14. {tellmgrambot-3.15.0 → tellmgrambot-3.15.2}/TeLLMgramBot/models.py +0 -0
  15. {tellmgrambot-3.15.0 → tellmgrambot-3.15.2}/TeLLMgramBot/providers/__init__.py +0 -0
  16. {tellmgrambot-3.15.0 → tellmgrambot-3.15.2}/TeLLMgramBot/providers/anthropic_provider.py +0 -0
  17. {tellmgrambot-3.15.0 → tellmgrambot-3.15.2}/TeLLMgramBot/providers/base.py +0 -0
  18. {tellmgrambot-3.15.0 → tellmgrambot-3.15.2}/TeLLMgramBot/providers/factory.py +0 -0
  19. {tellmgrambot-3.15.0 → tellmgrambot-3.15.2}/TeLLMgramBot/providers/openai_provider.py +0 -0
  20. {tellmgrambot-3.15.0 → tellmgrambot-3.15.2}/TeLLMgramBot/tools.py +0 -0
  21. {tellmgrambot-3.15.0 → tellmgrambot-3.15.2}/TeLLMgramBot/utils.py +0 -0
  22. {tellmgrambot-3.15.0 → tellmgrambot-3.15.2}/TeLLMgramBot/web_utils.py +0 -0
  23. {tellmgrambot-3.15.0 → tellmgrambot-3.15.2}/TeLLMgramBot.egg-info/SOURCES.txt +0 -0
  24. {tellmgrambot-3.15.0 → tellmgrambot-3.15.2}/TeLLMgramBot.egg-info/dependency_links.txt +0 -0
  25. {tellmgrambot-3.15.0 → tellmgrambot-3.15.2}/TeLLMgramBot.egg-info/top_level.txt +0 -0
  26. {tellmgrambot-3.15.0 → tellmgrambot-3.15.2}/setup.cfg +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: TeLLMgramBot
3
- Version: 3.15.0
3
+ Version: 3.15.2
4
4
  Summary: LLM-powered Telegram bot (OpenAI + Anthropic)
5
5
  Home-page: https://github.com/Digital-Heresy/TeLLMgramBot
6
6
  Author: Digital Heresy
@@ -22,6 +22,8 @@ Requires-Dist: tzdata>=2025.2
22
22
  Requires-Dist: pypdf>=6.0
23
23
  Requires-Dist: defusedxml>=0.7
24
24
  Requires-Dist: charset-normalizer>=3.0
25
+ Requires-Dist: python-docx>=1.2
26
+ Requires-Dist: openpyxl>=3.1
25
27
  Dynamic: author
26
28
  Dynamic: author-email
27
29
  Dynamic: description
@@ -45,7 +47,7 @@ The basic goal of this project is to create a bridge between a Telegram Bot and
45
47
  * Example: "What do you think of this article? [https://some_site/article]"
46
48
  * Uses a separate model (configurable via `url_model`) to handle larger URL content.
47
49
  * Share documents and text files for analysis and summarisation.
48
- * Supported formats: PDF, plain-text files (.txt, .md, .rst, .csv, .json, etc.), HTML, and XML.
50
+ * Supported formats: PDF (via pypdf), Microsoft Office documents (.docx via python-docx, .xlsx via openpyxl), plain-text files (.txt, .md, .rst, .csv, .json, etc.), HTML, and XML (via defusedxml).
49
51
  * The bot extracts and summarises content, with automatic encoding detection for non-UTF-8 files. Files over 20 MB are rejected.
50
52
  * Can be disabled via `document_processing: false` in config.
51
53
  * Ask questions about message history across all your chats using natural language; the bot will search, attribute messages to speakers, and include messages from other bots.
@@ -10,7 +10,7 @@ The basic goal of this project is to create a bridge between a Telegram Bot and
10
10
  * Example: "What do you think of this article? [https://some_site/article]"
11
11
  * Uses a separate model (configurable via `url_model`) to handle larger URL content.
12
12
  * Share documents and text files for analysis and summarisation.
13
- * Supported formats: PDF, plain-text files (.txt, .md, .rst, .csv, .json, etc.), HTML, and XML.
13
+ * Supported formats: PDF (via pypdf), Microsoft Office documents (.docx via python-docx, .xlsx via openpyxl), plain-text files (.txt, .md, .rst, .csv, .json, etc.), HTML, and XML (via defusedxml).
14
14
  * The bot extracts and summarises content, with automatic encoding detection for non-UTF-8 files. Files over 20 MB are rejected.
15
15
  * Can be disabled via `document_processing: false` in config.
16
16
  * Ask questions about message history across all your chats using natural language; the bot will search, attribute messages to speakers, and include messages from other bots.
@@ -63,6 +63,22 @@ _MSG_FORGET_PROMPT = "Do you really want me to forget our memories together
63
63
  _MSG_FORGET_COMPLETE = "Forget complete. Fresh start it is..."
64
64
  _MSG_FORGET_CANCELLED = "Forget cancelled. Glad you changed your mind!"
65
65
 
66
+
67
+ def _validated_allow_local(value) -> bool:
68
+ """
69
+ Strictly validate the allow_local_webhooks config value, which gates an SSRF guard.
70
+
71
+ Only a literal `True` enables it. Any other non-None, non-bool value (e.g. a quoted
72
+ "false" string, which is truthy) is rejected: a warning is logged and False is returned.
73
+
74
+ Args:
75
+ value: Raw `allow_local_webhooks` value from bot config.
76
+ """
77
+ if value is not None and not isinstance(value, bool):
78
+ logger.warning(f"Invalid allow_local_webhooks '{value}' (must be true/false); defaulting to false")
79
+ return value is True
80
+
81
+
66
82
  _SEARCH_TOOL = {
67
83
  "name": "search_messages",
68
84
  "description": (
@@ -926,17 +942,16 @@ class TelegramBot:
926
942
  """
927
943
  Route Telegram document messages through the document summarisation pipeline.
928
944
 
929
- Group trigger conditions (caption @mention, nickname/initials match, or reply-to-bot)
930
- are resolved via the shared _resolve_group_trigger() also used by tele_handle_message,
931
- including the exclusive-foreign-mention yield on reply-to-bot threads. Silently ignores
932
- documents in channels, edited messages, and in groups/supergroups where no trigger
933
- condition matched. Once triggered, respects the same global online/offline gate as
934
- tele_handle_response() (set via /start, /stop) - replies with the offline message
935
- rather than processing while offline. When document_processing is disabled in config,
936
- replies with a friendly message instead of processing - but only when the message was
937
- otherwise triggered (private chat, or a matched group trigger); untriggered group
938
- documents still yield silently regardless of the flag. Files over 20 MB receive a
939
- friendly error before download.
945
+ Gates checked in order:
946
+ trigger - shared _resolve_group_trigger() (mention/nickname/initials/reply-to-bot,
947
+ incl. exclusive-foreign-mention yield); silent in channels, edited
948
+ messages, and untriggered group messages.
949
+ online - same self._online gate as tele_handle_response() (/start, /stop);
950
+ offline reply if down.
951
+ processing - document_processing config flag; friendly reply if disabled, but
952
+ only once triggered (untriggered group documents stay silent
953
+ regardless of the flag).
954
+ file size - friendly error over 20 MB, checked before download.
940
955
 
941
956
  The user message stored in DB is '[Document: filename] caption'; document
942
957
  bytes are never persisted. Respects is_private for cross-chat context isolation.
@@ -1228,6 +1243,39 @@ class TelegramBot:
1228
1243
  """Reply to unrecognized commands so the LLM never sees them."""
1229
1244
  await update.message.reply_text("Unknown command. Use /help to see available commands.")
1230
1245
 
1246
+ async def _post_init(self, application: Application) -> None:
1247
+ """
1248
+ Schedule archival and MCP tool discovery as background tasks once the polling
1249
+ event loop is live.
1250
+
1251
+ Registered as python-telegram-bot's post_init hook so a large archival backlog or slow/unreachable MCP
1252
+ servers never block tele_handle_message/tele_handle_document from answering the first incoming update.
1253
+ Uses application.create_task() rather than asyncio.create_task() so both tasks are tracked and
1254
+ cancelled cleanly on shutdown.
1255
+ """
1256
+ if self._mcp_entries:
1257
+ application.create_task(self._discover_mcp_tools_background())
1258
+ application.create_task(run_archival(self.llm))
1259
+
1260
+ async def _discover_mcp_tools_background(self) -> None:
1261
+ """
1262
+ Discover MCP tools and merge them into self.webhook_schemas/self.webhook_defs.
1263
+
1264
+ Runs as a background task scheduled by _post_init() so MCP server round-trips
1265
+ never block startup. Until this completes, owner-triggered tool calls simply see
1266
+ the webhook-only tool set; MCP tools become available once discovery finishes.
1267
+ """
1268
+ try:
1269
+ existing_names = set(self.webhook_defs.keys()) | {'search_messages'}
1270
+ mcp_schemas, mcp_defs = await discover_mcp_tools(
1271
+ self._mcp_entries, existing_names, allow_local=self._allow_local_webhooks,
1272
+ )
1273
+ self.webhook_schemas = self.webhook_schemas + mcp_schemas
1274
+ self.webhook_defs = {**self.webhook_defs, **mcp_defs}
1275
+ logger.info(f"MCP discovery complete: {len(mcp_schemas)} tool(s) registered")
1276
+ except Exception:
1277
+ logger.error("Background MCP discovery failed", exc_info=True)
1278
+
1231
1279
  def poll(self):
1232
1280
  """
1233
1281
  Start the main polling loop for Telegram updates.
@@ -1240,14 +1288,14 @@ class TelegramBot:
1240
1288
 
1241
1289
  # Initialization
1242
1290
  def __init__(self,
1243
- bot_owner = INIT_BOT_CONFIG['bot_owner'],
1244
- bot_nickname = INIT_BOT_CONFIG['bot_nickname'],
1245
- bot_initials = INIT_BOT_CONFIG['bot_initials'],
1246
- chat_model = INIT_BOT_CONFIG['chat_model'],
1247
- url_model = INIT_BOT_CONFIG['url_model'],
1248
- token_limit = INIT_BOT_CONFIG['token_limit'],
1249
- search_limit = INIT_BOT_CONFIG['search_limit'],
1250
- persona_temp = INIT_BOT_CONFIG['persona_temp'],
1291
+ bot_owner = INIT_BOT_CONFIG['bot_owner'],
1292
+ bot_nickname = INIT_BOT_CONFIG['bot_nickname'],
1293
+ bot_initials = INIT_BOT_CONFIG['bot_initials'],
1294
+ chat_model = INIT_BOT_CONFIG['chat_model'],
1295
+ url_model = INIT_BOT_CONFIG['url_model'],
1296
+ token_limit = INIT_BOT_CONFIG['token_limit'],
1297
+ search_limit = INIT_BOT_CONFIG['search_limit'],
1298
+ persona_temp = INIT_BOT_CONFIG['persona_temp'],
1251
1299
  archive_days = INIT_BOT_CONFIG['archive_days'],
1252
1300
  document_processing = INIT_BOT_CONFIG['document_processing'],
1253
1301
  persona_prompt = INIT_BOT_CONFIG['persona_prompt'],
@@ -1255,6 +1303,8 @@ class TelegramBot:
1255
1303
  instance_name: str | None = None,
1256
1304
  webhook_schemas: list | None = None,
1257
1305
  webhook_defs: dict | None = None,
1306
+ mcp_entries: list | None = None,
1307
+ allow_local_webhooks: bool = False,
1258
1308
  ):
1259
1309
  """
1260
1310
  Initialize the Telegram bot with LLM configuration and API keys.
@@ -1280,6 +1330,10 @@ class TelegramBot:
1280
1330
  If None, no webhook tools are registered.
1281
1331
  webhook_defs: Resolved webhook tool definitions keyed by tool name (from build_tool_registry).
1282
1332
  If None, no webhook tools are registered.
1333
+ mcp_entries: Raw 'mcp_server:' config entries (from the 'tools:' block), or None.
1334
+ _post_init() discovers these in the background and merges results into
1335
+ self.webhook_schemas/self.webhook_defs once discovery completes.
1336
+ allow_local_webhooks: Passed through to discover_mcp_tools() when MCP discovery runs.
1283
1337
 
1284
1338
  Side Effects:
1285
1339
  - Normalises bot_owner to list[str] and stores in self.telegram['owners'].
@@ -1296,7 +1350,9 @@ class TelegramBot:
1296
1350
  self.token_warning = {} # Determines whether user has reached token limit by AI model
1297
1351
  self.conversations = {} # Provides Conversation class per user based on bot response
1298
1352
  self.webhook_schemas = webhook_schemas or [] # Provider-compatible schemas for webhook tools
1299
- self.webhook_defs = webhook_defs or {} # Resolved tool definitions keyed by name
1353
+ self.webhook_defs = webhook_defs or {} # Resolved tool definitions keyed by name
1354
+ self._mcp_entries = mcp_entries or [] # Raw mcp_server entries; discovered in _post_init()
1355
+ self._allow_local_webhooks = allow_local_webhooks
1300
1356
  owners = bot_owner if isinstance(bot_owner, list) else [bot_owner]
1301
1357
  self.telegram = {
1302
1358
  'bot_id' : 0, # overwritten by _tele_info(); 0 is a safe sentinel
@@ -1319,7 +1375,12 @@ class TelegramBot:
1319
1375
  loop.create_task(self._tele_info())
1320
1376
 
1321
1377
  # Build our application with handlers for Commands, Messages, and Errors
1322
- self.telegram['app'] = Application.builder().token(os.environ['TELLMGRAMBOT_TELEGRAM_API_KEY']).build()
1378
+ self.telegram['app'] = (
1379
+ Application.builder()
1380
+ .token(os.environ['TELLMGRAMBOT_TELEGRAM_API_KEY'])
1381
+ .post_init(self._post_init)
1382
+ .build()
1383
+ )
1323
1384
  self.telegram['app'].add_handler(CommandHandler('help', self.tele_commands))
1324
1385
  self.telegram['app'].add_handler(CommandHandler('start', self.tele_start_command))
1325
1386
  self.telegram['app'].add_handler(CommandHandler('stop', self.tele_stop_command))
@@ -1391,9 +1452,10 @@ class TelegramBot:
1391
1452
 
1392
1453
  Calls init_structure() to bootstrap directories, API keys, and configuration files,
1393
1454
  unpacking a three-tuple (ApiKeyStatus, config dict, persona prompt str with system
1394
- appendix already appended). Builds webhook tool registry from 'tools:' config, discovers
1395
- MCP tools from any 'mcp_server:' entries, and merges both into the final tool registry.
1396
- Applies defaults for any missing values and returns a fully initialized TelegramBot.
1455
+ appendix already appended). Builds the webhook tool registry from 'tools:' config and
1456
+ collects any 'mcp_server:' entries (passed through to TelegramBot for later discovery -
1457
+ see Side Effects). Applies defaults for any missing values and returns a fully
1458
+ initialized TelegramBot.
1397
1459
 
1398
1460
  Args:
1399
1461
  config_file: Filename of the bot configuration YAML (default: 'config.yaml').
@@ -1406,50 +1468,42 @@ class TelegramBot:
1406
1468
 
1407
1469
  Side Effects:
1408
1470
  - Calls init_structure() which creates directories, config/prompt files, and checks API keys.
1409
- - Calls discover_mcp_tools() if any 'mcp_server:' entries are in config (gracefully degrades if called from async context).
1410
- - May log warnings (for missing config values, empty prompt, or skipped MCP discovery), but does not print a startup API key status summary.
1471
+ - Passes any 'mcp_server:' entries to TelegramBot as mcp_entries; discovery itself runs
1472
+ in the background via TelegramBot._post_init() once the polling event loop is live.
1473
+ - May log warnings (for missing config values or an empty prompt), but does not print a
1474
+ startup API key status summary.
1411
1475
  - Log identity/file label is taken from bot config `instance_name` when set; otherwise defaults to the bot's Telegram username once _tele_info() resolves.
1412
1476
  """
1413
1477
  # Bootstrap directories, logging, config, prompt (with appendix), and API keys in one call.
1414
1478
  key_status, config, prompt = init_structure(config_file, prompt_file)
1415
1479
 
1416
1480
  # Build the webhook tool registry from the optional 'tools:' block in bot config.
1417
- allow_local = config['allow_local_webhooks'] or False
1481
+ allow_local = _validated_allow_local(config['allow_local_webhooks'])
1418
1482
  webhook_schemas, webhook_defs = build_tool_registry(config.get('tools') or [], allow_local)
1419
1483
 
1420
- # Discover MCP tools from any 'mcp_server:' entries in the tools config.
1484
+ # Raw mcp_server entries; TelegramBot._post_init() runs discovery in the background.
1421
1485
  mcp_entries = [
1422
1486
  e for e in (config.get('tools') or [])
1423
1487
  if isinstance(e, dict) and 'mcp_server' in e
1424
1488
  ]
1425
- if mcp_entries:
1426
- existing_names = set(webhook_defs.keys()) | {'search_messages'}
1427
- mcp_schemas, mcp_defs = [], {}
1428
- try:
1429
- asyncio.get_running_loop()
1430
- logger.warning("MCP discovery skipped: set() called from within an async context.")
1431
- except RuntimeError:
1432
- mcp_schemas, mcp_defs = asyncio.run(discover_mcp_tools(
1433
- mcp_entries, existing_names, allow_local=allow_local,
1434
- ))
1435
- webhook_schemas = webhook_schemas + mcp_schemas
1436
- webhook_defs = {**webhook_defs, **mcp_defs}
1437
1489
 
1438
1490
  # Apply parameters to bot:
1439
1491
  return TelegramBot(
1440
- bot_owner = config['bot_owner'],
1441
- bot_nickname = config['bot_nickname'],
1442
- bot_initials = config['bot_initials'],
1443
- chat_model = config['chat_model'],
1444
- url_model = config['url_model'],
1445
- token_limit = config['token_limit'],
1446
- search_limit = config['search_limit'],
1447
- persona_temp = config['persona_temp'],
1448
- archive_days = config['archive_days'],
1449
- document_processing = config.get('document_processing'),
1450
- persona_prompt = prompt,
1451
- key_status = key_status,
1452
- instance_name = config['instance_name'],
1453
- webhook_schemas = webhook_schemas,
1454
- webhook_defs = webhook_defs,
1492
+ bot_owner = config['bot_owner'],
1493
+ bot_nickname = config['bot_nickname'],
1494
+ bot_initials = config['bot_initials'],
1495
+ chat_model = config['chat_model'],
1496
+ url_model = config['url_model'],
1497
+ token_limit = config['token_limit'],
1498
+ search_limit = config['search_limit'],
1499
+ persona_temp = config['persona_temp'],
1500
+ archive_days = config['archive_days'],
1501
+ document_processing = config['document_processing'],
1502
+ persona_prompt = prompt,
1503
+ key_status = key_status,
1504
+ instance_name = config['instance_name'],
1505
+ webhook_schemas = webhook_schemas,
1506
+ webhook_defs = webhook_defs,
1507
+ mcp_entries = mcp_entries,
1508
+ allow_local_webhooks = allow_local,
1455
1509
  )
@@ -11,6 +11,7 @@ Tier 2: Episodic summarization - old Tier 1 rows compressed into thematic digest
11
11
  Raw messages are never deleted; archived_at flags rows to skip during context loading.
12
12
  Search still hits raw rows regardless of archived_at.
13
13
  """
14
+ import asyncio
14
15
  import json
15
16
  import logging
16
17
 
@@ -24,6 +25,10 @@ logger = logging.getLogger(__name__)
24
25
 
25
26
  _archival_running = False
26
27
 
28
+ # Caps concurrent in-flight LLM calls per run_archival() invocation so a large
29
+ # backlog doesn't hammer the provider's rate limits.
30
+ _BATCH_CONCURRENCY = 5
31
+
27
32
  _TIER1_PROMPT = (
28
33
  "Extract key facts from this conversation. "
29
34
  "Ignore greetings, acknowledgments, and filler. "
@@ -88,14 +93,34 @@ def _fmt_ts(ts: str) -> str:
88
93
  return ts[:16].replace('T', ' ') + ' UTC'
89
94
 
90
95
 
96
+ async def _gather_bounded(coros) -> None:
97
+ """
98
+ Run an iterable of coroutines with at most _BATCH_CONCURRENCY in flight at once.
99
+
100
+ Pulls coroutines from the iterable lazily via a fixed pool of _BATCH_CONCURRENCY
101
+ workers, rather than wrapping every coroutine in a Task up front - so a large
102
+ archival backlog never creates more pending tasks than the concurrency cap.
103
+
104
+ Args:
105
+ coros: Iterable of coroutines to execute with bounded concurrency.
106
+ """
107
+ coro_iter = iter(coros)
108
+
109
+ async def _worker():
110
+ for coro in coro_iter:
111
+ await coro
112
+
113
+ await asyncio.gather(*(_worker() for _ in range(_BATCH_CONCURRENCY)))
114
+
115
+
91
116
  async def _run_tier1(config: dict) -> None:
92
117
  """
93
118
  Extract key facts from messages older than archive_days into Tier 1 rows.
94
119
 
95
120
  Groups old messages by chat and day, batches each group through the LLM with
96
121
  _TIER1_PROMPT, stores the extracted facts as summary_archive rows, and flags
97
- source rows with archived_at. Logs warnings on batch failures but continues
98
- processing other batches.
122
+ source rows with archived_at. Batches run concurrently, bounded by _BATCH_CONCURRENCY.
123
+ Logs warnings on batch failures but continues processing other batches.
99
124
 
100
125
  Args:
101
126
  config: Dict with keys: chat_model, and optionally archive_days.
@@ -131,12 +156,15 @@ async def _run_tier1(config: dict) -> None:
131
156
  batches.setdefault(key, []).append(row)
132
157
 
133
158
  provider = get_provider(model)
134
- for (chat_id, day), batch in batches.items():
159
+
160
+ async def _process(chat_id, day, batch):
135
161
  try:
136
162
  await _process_tier1_batch(provider, model, chat_id, day, batch)
137
163
  except Exception as e:
138
164
  logger.warning(f"ARCHIVE: Tier 1 batch {chat_id}/{day} failed: {e}")
139
165
 
166
+ await _gather_bounded(_process(chat_id, day, batch) for (chat_id, day), batch in batches.items())
167
+
140
168
 
141
169
  async def _process_tier1_batch(provider, model: str, chat_id: int, day: str, rows: list) -> None:
142
170
  """
@@ -209,10 +237,10 @@ async def _run_tier2(config: dict) -> None:
209
237
  """
210
238
  Compress old Tier 1 rows into Tier 2 (episodic) summaries.
211
239
 
212
- Groups Tier 1 rows older than archive_days * 2 by chat and month, batches each
213
- group through the LLM with _TIER2_PROMPT, stores the result as a summary_archive
214
- row (tier 2), and flags source Tier 1 rows with archived_at. Logs warnings on
215
- batch failures but continues processing other batches.
240
+ Groups Tier 1 rows older than archive_days * 2 by chat and month, batches each group through the LLM
241
+ with _TIER2_PROMPT, stores the result as a summary_archive row (tier 2), and flags source Tier 1 rows
242
+ with archived_at. Batches run concurrently, bounded by _BATCH_CONCURRENCY.
243
+ Logs warnings on batch failures but continues processing other batches.
216
244
 
217
245
  Args:
218
246
  config: Dict with keys: chat_model, and optionally archive_days.
@@ -246,12 +274,15 @@ async def _run_tier2(config: dict) -> None:
246
274
  batches.setdefault(key, []).append(row)
247
275
 
248
276
  provider = get_provider(model)
249
- for (chat_id, month), batch in batches.items():
277
+
278
+ async def _process(chat_id, month, batch):
250
279
  try:
251
280
  await _process_tier2_batch(provider, model, chat_id, month, batch)
252
281
  except Exception as e:
253
282
  logger.warning(f"ARCHIVE: Tier 2 batch {chat_id}/{month} failed: {e}")
254
283
 
284
+ await _gather_bounded(_process(chat_id, month, batch) for (chat_id, month), batch in batches.items())
285
+
255
286
 
256
287
  async def _process_tier2_batch(provider, model: str, chat_id: int, month: str, rows: list) -> None:
257
288
  """
@@ -13,7 +13,6 @@ from typing import Optional
13
13
 
14
14
  from .utils import read_text, generate_file_path, read_yaml, execution_dir, generate_filename
15
15
  from .database import set_db_filename, init_db, get_db_path
16
- from .archive import run_archival
17
16
 
18
17
  logger = logging.getLogger(__name__)
19
18
  _logging_initialized = False
@@ -529,6 +528,9 @@ def init_structure(
529
528
  the custom filename (only if the target does not yet exist). This ordering ensures the DB
530
529
  filename is settled before schema init.
531
530
 
531
+ Archival (Tier 1 and Tier 2) runs as a background task scheduled by TelegramBot._post_init()
532
+ once the polling event loop is live, so large backlogs never block the bot's first message response.
533
+
532
534
  Args:
533
535
  config_file: Name of the bot configuration file (default: 'config.yaml').
534
536
  Resolved relative to TELLMGRAMBOT_CONFIGS_PATH.
@@ -576,9 +578,9 @@ def init_structure(
576
578
  else:
577
579
  logger.debug(f"DB migration skipped: conversations.db not found at {legacy_path}")
578
580
 
579
- # In the sync path (no running loop), run DB init now before keys are loaded.
580
- # In the async path (running loop), DB init is deferred into the background task
581
- # so it runs after keys are loaded (below).
581
+ # In the sync path (no running loop), run DB init now.
582
+ # In the async path (running loop - e.g. a caller invoking init_structure() from
583
+ # inside an already-async context), DB init is deferred into a background task.
582
584
  try:
583
585
  loop = asyncio.get_running_loop()
584
586
  _has_loop = True
@@ -593,18 +595,14 @@ def init_structure(
593
595
  logger.warning(f"File '{prompt_file}' is empty, using default persona prompt.")
594
596
  prompt = prompt.rstrip() + "\n\n" + _SYSTEM_APPENDIX
595
597
 
596
- # Load API keys before running archival so the provider can authenticate.
597
598
  key_status = init_keys(config)
598
599
 
599
600
  if _has_loop:
600
- async def _init_and_archive():
601
+ async def _init_db_task():
601
602
  try:
602
603
  await init_db()
603
- await run_archival(config)
604
604
  except Exception:
605
- logger.error(f"Background startup initialization/archive task failed", exc_info=True)
606
- loop.create_task(_init_and_archive())
607
- else:
608
- asyncio.run(run_archival(config))
605
+ logger.error("Background startup DB initialization failed", exc_info=True)
606
+ loop.create_task(_init_db_task())
609
607
 
610
608
  return (key_status, config, prompt)
@@ -2,11 +2,14 @@
2
2
  import io
3
3
  import logging
4
4
  import re
5
+ import zipfile
5
6
  from pathlib import Path
6
7
  from typing import Optional
7
8
 
8
9
  from charset_normalizer import from_bytes as _cn_from_bytes
9
10
  import defusedxml.ElementTree as _defusedxml_ET
11
+ import docx
12
+ import openpyxl
10
13
  import pypdf
11
14
 
12
15
  from .utils import log_error
@@ -56,7 +59,13 @@ _PLAIN_TEXT_EXTENSIONS = frozenset({
56
59
 
57
60
  _HTML_MIMES = frozenset({'text/html', 'application/xhtml+xml'})
58
61
  _XML_MIMES = frozenset({'text/xml', 'application/xml'})
62
+ _DOCX_MIME = 'application/vnd.openxmlformats-officedocument.wordprocessingml.document'
63
+ _XLSX_MIME = 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet'
59
64
  _PDF_PAGE_CAP = 100
65
+ _DOCX_LINE_CAP = 2000
66
+ _XLSX_ROW_CAP = 2000
67
+ _ZIP_ENTRY_SIZE_CAP = 100_000_000 # 100 MB uncompressed per entry - zip-bomb guard
68
+ _ZIP_TOTAL_SIZE_CAP = 500_000_000 # 500 MB uncompressed across all entries combined
60
69
 
61
70
 
62
71
  def handle_greetings(text: str) -> Optional[str]:
@@ -87,6 +96,39 @@ def handle_common_queries(text: str) -> Optional[str]:
87
96
  return None
88
97
 
89
98
 
99
+ def _zip_entries_within_cap(file_bytes: bytes) -> bool:
100
+ """
101
+ Check declared uncompressed entry sizes in a zip-based file before decompression.
102
+
103
+ Guards against zip-bomb style resource exhaustion for .docx/.xlsx (both are zip
104
+ containers), without actually decompressing anything. Rejects a file if any single
105
+ entry's declared uncompressed size exceeds _ZIP_ENTRY_SIZE_CAP, or if the sum across
106
+ all entries exceeds _ZIP_TOTAL_SIZE_CAP (catches many small entries that each pass
107
+ the per-entry cap but together still exhaust memory). Returns False for a corrupted/
108
+ non-zip file too, since python-docx/openpyxl would fail on it anyway.
109
+
110
+ Args:
111
+ file_bytes: Raw file bytes to inspect.
112
+
113
+ Returns:
114
+ True if the file is a valid zip, every entry's declared size is within the
115
+ per-entry cap, and the total declared size is within the total cap; False
116
+ otherwise.
117
+ """
118
+ try:
119
+ with zipfile.ZipFile(io.BytesIO(file_bytes)) as zf:
120
+ total = 0
121
+ for info in zf.infolist():
122
+ if info.file_size > _ZIP_ENTRY_SIZE_CAP:
123
+ return False
124
+ total += info.file_size
125
+ if total > _ZIP_TOTAL_SIZE_CAP:
126
+ return False
127
+ return True
128
+ except Exception:
129
+ return False
130
+
131
+
90
132
  def _decode_bytes(raw: bytes) -> tuple:
91
133
  """
92
134
  Decode raw bytes to a string via UTF-8 -> charset-normalizer -> Latin-1 chain.
@@ -108,12 +150,16 @@ def _extract_document_text(file_bytes: bytes, mime_type: str, filename: str) ->
108
150
  """
109
151
  Extract plain text from document bytes, routing by MIME type and file extension.
110
152
 
111
- PDF text is extracted via pypdf (capped at _PDF_PAGE_CAP pages, strict=False).
112
- HTML content has tags stripped via strip_html_markup. XML is safely parsed via
113
- defusedxml to extract text nodes without XXE risk; falls back to plain-text
114
- decode if the XML is malformed. All other plain-text types are decoded using
115
- a UTF-8 -> charset-normalizer -> Latin-1 chain; non-UTF-8 files prepend a
116
- [File encoding: ...] annotation so the LLM has context.
153
+ Extraction by type:
154
+ PDF - pypdf, capped at _PDF_PAGE_CAP pages, strict=False.
155
+ .docx - python-docx; paragraphs + flattened table rows, capped at _DOCX_LINE_CAP combined lines.
156
+ .xlsx - openpyxl in read_only/data_only mode; capped at _XLSX_ROW_CAP rows total across all sheets.
157
+ .docx/.xlsx are zip containers, so _zip_entries_within_cap() rejects implausible declared
158
+ uncompressed entry sizes before either library decompresses anything (zip-bomb guard).
159
+ HTML - tags stripped via strip_html_markup.
160
+ XML - defusedxml (XXE-safe); falls back to plain-text decode if malformed.
161
+ other - UTF-8 -> charset-normalizer -> Latin-1 chain;
162
+ non-UTF-8 files get a [File encoding: ...] prefix.
117
163
 
118
164
  Args:
119
165
  file_bytes: Raw document bytes downloaded from Telegram.
@@ -128,6 +174,8 @@ def _extract_document_text(file_bytes: bytes, mime_type: str, filename: str) ->
128
174
  ext = Path(filename).suffix.lower() if filename else ''
129
175
 
130
176
  is_pdf = mime == 'application/pdf' or ext == '.pdf'
177
+ is_docx = ext == '.docx' or mime == _DOCX_MIME
178
+ is_xlsx = ext == '.xlsx' or mime == _XLSX_MIME
131
179
  is_html = ext in ('.html', '.htm') or mime in _HTML_MIMES
132
180
  is_xml = ext == '.xml' or mime in _XML_MIMES
133
181
  is_plain = mime.startswith('text/') or ext in _PLAIN_TEXT_EXTENSIONS
@@ -145,6 +193,67 @@ def _extract_document_text(file_bytes: bytes, mime_type: str, filename: str) ->
145
193
  log_error(e, 'PDF')
146
194
  return None, "Something went wrong while reading that PDF. Please try again."
147
195
 
196
+ if is_docx:
197
+ if not _zip_entries_within_cap(file_bytes):
198
+ return None, "Something went wrong while reading that document. Please try again."
199
+ try:
200
+ document = docx.Document(io.BytesIO(file_bytes))
201
+ lines = []
202
+ for p in document.paragraphs:
203
+ if len(lines) >= _DOCX_LINE_CAP:
204
+ break
205
+ if p.text:
206
+ lines.append(p.text)
207
+ for table in document.tables:
208
+ if len(lines) >= _DOCX_LINE_CAP:
209
+ break
210
+ for row in table.rows:
211
+ if len(lines) >= _DOCX_LINE_CAP:
212
+ break
213
+ row_text = '\t'.join(cell.text for cell in row.cells)
214
+ if row_text.strip():
215
+ lines.append(row_text)
216
+ text = '\n'.join(lines)
217
+ if not text.strip():
218
+ return None, "This document appears to have no readable text in it."
219
+ return text, None
220
+ except Exception as e:
221
+ log_error(e, 'DOCX')
222
+ return None, "Something went wrong while reading that document. Please try again."
223
+
224
+ if is_xlsx:
225
+ if not _zip_entries_within_cap(file_bytes):
226
+ return None, "Something went wrong while reading that spreadsheet. Please try again."
227
+ workbook = None
228
+ try:
229
+ workbook = openpyxl.load_workbook(io.BytesIO(file_bytes), read_only=True, data_only=True)
230
+ lines = []
231
+ total_rows = 0
232
+ for sheet in workbook.worksheets:
233
+ if total_rows >= _XLSX_ROW_CAP:
234
+ break
235
+ sheet_lines = []
236
+ for row in sheet.iter_rows(values_only=True):
237
+ if total_rows >= _XLSX_ROW_CAP:
238
+ break
239
+ values = ['' if v is None else str(v) for v in row]
240
+ if any(values):
241
+ sheet_lines.append('\t'.join(values))
242
+ total_rows += 1
243
+ if sheet_lines:
244
+ lines.append(f"## Sheet: {sheet.title}")
245
+ lines.extend(sheet_lines)
246
+ text = '\n'.join(lines)
247
+ if not text.strip():
248
+ return None, "This spreadsheet appears to have no readable data in it."
249
+ return text, None
250
+ except Exception as e:
251
+ log_error(e, 'XLSX')
252
+ return None, "Something went wrong while reading that spreadsheet. Please try again."
253
+ finally:
254
+ if workbook is not None:
255
+ workbook.close()
256
+
148
257
  if is_html:
149
258
  raw_text, _ = _decode_bytes(file_bytes)
150
259
  return strip_html_markup(raw_text), None
@@ -166,7 +275,7 @@ def _extract_document_text(file_bytes: bytes, mime_type: str, filename: str) ->
166
275
  text = f"[File encoding: {encoding}]\n{text}"
167
276
  return text, None
168
277
 
169
- return None, "I can only read plain text and PDF files right now."
278
+ return None, "I can only read PDF, Word documents (.docx), spreadsheets (.xlsx), HTML, XML, and plain text files right now."
170
279
 
171
280
 
172
281
  async def summarise_text(
@@ -179,12 +288,10 @@ async def summarise_text(
179
288
  """
180
289
  Token-prune content, apply template, and complete via the LLM.
181
290
 
182
- Prunes content so the fully composed system message (prompt + template with content
183
- substituted) fits within the model's token budget (max_tokens - 500), then calls the LLM.
184
- Token counting is measured against the composed message at every pruning step, not just
185
- the raw content, so the budget guarantee matches what is actually sent to the provider -
186
- a large template or persona prompt is accounted for, not just the content itself. The
187
- template must contain a {content} placeholder.
291
+ Prunes content so the fully composed message (prompt + template + content) fits the
292
+ model's token budget (max_tokens - 500), measuring the composed message at every
293
+ pruning step - not just raw content - so the budget guarantee matches what's actually
294
+ sent. The template must contain a {content} placeholder.
188
295
 
189
296
  Args:
190
297
  content: Text content to summarise (URL body or document text).
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: TeLLMgramBot
3
- Version: 3.15.0
3
+ Version: 3.15.2
4
4
  Summary: LLM-powered Telegram bot (OpenAI + Anthropic)
5
5
  Home-page: https://github.com/Digital-Heresy/TeLLMgramBot
6
6
  Author: Digital Heresy
@@ -22,6 +22,8 @@ Requires-Dist: tzdata>=2025.2
22
22
  Requires-Dist: pypdf>=6.0
23
23
  Requires-Dist: defusedxml>=0.7
24
24
  Requires-Dist: charset-normalizer>=3.0
25
+ Requires-Dist: python-docx>=1.2
26
+ Requires-Dist: openpyxl>=3.1
25
27
  Dynamic: author
26
28
  Dynamic: author-email
27
29
  Dynamic: description
@@ -45,7 +47,7 @@ The basic goal of this project is to create a bridge between a Telegram Bot and
45
47
  * Example: "What do you think of this article? [https://some_site/article]"
46
48
  * Uses a separate model (configurable via `url_model`) to handle larger URL content.
47
49
  * Share documents and text files for analysis and summarisation.
48
- * Supported formats: PDF, plain-text files (.txt, .md, .rst, .csv, .json, etc.), HTML, and XML.
50
+ * Supported formats: PDF (via pypdf), Microsoft Office documents (.docx via python-docx, .xlsx via openpyxl), plain-text files (.txt, .md, .rst, .csv, .json, etc.), HTML, and XML (via defusedxml).
49
51
  * The bot extracts and summarises content, with automatic encoding detection for non-UTF-8 files. Files over 20 MB are rejected.
50
52
  * Can be disabled via `document_processing: false` in config.
51
53
  * Ask questions about message history across all your chats using natural language; the bot will search, attribute messages to speakers, and include messages from other bots.
@@ -11,3 +11,5 @@ tzdata>=2025.2
11
11
  pypdf>=6.0
12
12
  defusedxml>=0.7
13
13
  charset-normalizer>=3.0
14
+ python-docx>=1.2
15
+ openpyxl>=3.1
@@ -5,7 +5,7 @@ with open("README.md", "r") as fh:
5
5
 
6
6
  setup(
7
7
  name='TeLLMgramBot',
8
- version='3.15.0',
8
+ version='3.15.2',
9
9
  packages=find_packages(),
10
10
  license='MIT',
11
11
  author='Digital Heresy',
@@ -28,6 +28,8 @@ setup(
28
28
  'pypdf>=6.0',
29
29
  'defusedxml>=0.7',
30
30
  'charset-normalizer>=3.0',
31
+ 'python-docx>=1.2',
32
+ 'openpyxl>=3.1',
31
33
  ],
32
34
  python_requires='>=3.10'
33
35
  )
File without changes
File without changes