TeLLMgramBot 3.15.0__tar.gz → 3.15.2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {tellmgrambot-3.15.0 → tellmgrambot-3.15.2}/PKG-INFO +4 -2
- {tellmgrambot-3.15.0 → tellmgrambot-3.15.2}/README.md +1 -1
- {tellmgrambot-3.15.0 → tellmgrambot-3.15.2}/TeLLMgramBot/TeLLMgramBot.py +109 -55
- {tellmgrambot-3.15.0 → tellmgrambot-3.15.2}/TeLLMgramBot/archive.py +39 -8
- {tellmgrambot-3.15.0 → tellmgrambot-3.15.2}/TeLLMgramBot/initialize.py +9 -11
- {tellmgrambot-3.15.0 → tellmgrambot-3.15.2}/TeLLMgramBot/message_handlers.py +120 -13
- {tellmgrambot-3.15.0 → tellmgrambot-3.15.2}/TeLLMgramBot.egg-info/PKG-INFO +4 -2
- {tellmgrambot-3.15.0 → tellmgrambot-3.15.2}/TeLLMgramBot.egg-info/requires.txt +2 -0
- {tellmgrambot-3.15.0 → tellmgrambot-3.15.2}/setup.py +3 -1
- {tellmgrambot-3.15.0 → tellmgrambot-3.15.2}/LICENSE +0 -0
- {tellmgrambot-3.15.0 → tellmgrambot-3.15.2}/TeLLMgramBot/__init__.py +0 -0
- {tellmgrambot-3.15.0 → tellmgrambot-3.15.2}/TeLLMgramBot/conversation.py +0 -0
- {tellmgrambot-3.15.0 → tellmgrambot-3.15.2}/TeLLMgramBot/database.py +0 -0
- {tellmgrambot-3.15.0 → tellmgrambot-3.15.2}/TeLLMgramBot/models.py +0 -0
- {tellmgrambot-3.15.0 → tellmgrambot-3.15.2}/TeLLMgramBot/providers/__init__.py +0 -0
- {tellmgrambot-3.15.0 → tellmgrambot-3.15.2}/TeLLMgramBot/providers/anthropic_provider.py +0 -0
- {tellmgrambot-3.15.0 → tellmgrambot-3.15.2}/TeLLMgramBot/providers/base.py +0 -0
- {tellmgrambot-3.15.0 → tellmgrambot-3.15.2}/TeLLMgramBot/providers/factory.py +0 -0
- {tellmgrambot-3.15.0 → tellmgrambot-3.15.2}/TeLLMgramBot/providers/openai_provider.py +0 -0
- {tellmgrambot-3.15.0 → tellmgrambot-3.15.2}/TeLLMgramBot/tools.py +0 -0
- {tellmgrambot-3.15.0 → tellmgrambot-3.15.2}/TeLLMgramBot/utils.py +0 -0
- {tellmgrambot-3.15.0 → tellmgrambot-3.15.2}/TeLLMgramBot/web_utils.py +0 -0
- {tellmgrambot-3.15.0 → tellmgrambot-3.15.2}/TeLLMgramBot.egg-info/SOURCES.txt +0 -0
- {tellmgrambot-3.15.0 → tellmgrambot-3.15.2}/TeLLMgramBot.egg-info/dependency_links.txt +0 -0
- {tellmgrambot-3.15.0 → tellmgrambot-3.15.2}/TeLLMgramBot.egg-info/top_level.txt +0 -0
- {tellmgrambot-3.15.0 → tellmgrambot-3.15.2}/setup.cfg +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: TeLLMgramBot
|
|
3
|
-
Version: 3.15.0
|
|
3
|
+
Version: 3.15.2
|
|
4
4
|
Summary: LLM-powered Telegram bot (OpenAI + Anthropic)
|
|
5
5
|
Home-page: https://github.com/Digital-Heresy/TeLLMgramBot
|
|
6
6
|
Author: Digital Heresy
|
|
@@ -22,6 +22,8 @@ Requires-Dist: tzdata>=2025.2
|
|
|
22
22
|
Requires-Dist: pypdf>=6.0
|
|
23
23
|
Requires-Dist: defusedxml>=0.7
|
|
24
24
|
Requires-Dist: charset-normalizer>=3.0
|
|
25
|
+
Requires-Dist: python-docx>=1.2
|
|
26
|
+
Requires-Dist: openpyxl>=3.1
|
|
25
27
|
Dynamic: author
|
|
26
28
|
Dynamic: author-email
|
|
27
29
|
Dynamic: description
|
|
@@ -45,7 +47,7 @@ The basic goal of this project is to create a bridge between a Telegram Bot and
|
|
|
45
47
|
* Example: "What do you think of this article? [https://some_site/article]"
|
|
46
48
|
* Uses a separate model (configurable via `url_model`) to handle larger URL content.
|
|
47
49
|
* Share documents and text files for analysis and summarisation.
|
|
48
|
-
* Supported formats: PDF, plain-text files (.txt, .md, .rst, .csv, .json, etc.), HTML, and XML.
|
|
50
|
+
* Supported formats: PDF (via pypdf), Microsoft Office documents (.docx via python-docx, .xlsx via openpyxl), plain-text files (.txt, .md, .rst, .csv, .json, etc.), HTML, and XML (via defusedxml).
|
|
49
51
|
* The bot extracts and summarises content, with automatic encoding detection for non-UTF-8 files. Files over 20 MB are rejected.
|
|
50
52
|
* Can be disabled via `document_processing: false` in config.
|
|
51
53
|
* Ask questions about message history across all your chats using natural language; the bot will search, attribute messages to speakers, and include messages from other bots.
|
|
@@ -10,7 +10,7 @@ The basic goal of this project is to create a bridge between a Telegram Bot and
|
|
|
10
10
|
* Example: "What do you think of this article? [https://some_site/article]"
|
|
11
11
|
* Uses a separate model (configurable via `url_model`) to handle larger URL content.
|
|
12
12
|
* Share documents and text files for analysis and summarisation.
|
|
13
|
-
* Supported formats: PDF, plain-text files (.txt, .md, .rst, .csv, .json, etc.), HTML, and XML.
|
|
13
|
+
* Supported formats: PDF (via pypdf), Microsoft Office documents (.docx via python-docx, .xlsx via openpyxl), plain-text files (.txt, .md, .rst, .csv, .json, etc.), HTML, and XML (via defusedxml).
|
|
14
14
|
* The bot extracts and summarises content, with automatic encoding detection for non-UTF-8 files. Files over 20 MB are rejected.
|
|
15
15
|
* Can be disabled via `document_processing: false` in config.
|
|
16
16
|
* Ask questions about message history across all your chats using natural language; the bot will search, attribute messages to speakers, and include messages from other bots.
|
|
@@ -63,6 +63,22 @@ _MSG_FORGET_PROMPT = "Do you really want me to forget our memories together
|
|
|
63
63
|
_MSG_FORGET_COMPLETE = "Forget complete. Fresh start it is..."
|
|
64
64
|
_MSG_FORGET_CANCELLED = "Forget cancelled. Glad you changed your mind!"
|
|
65
65
|
|
|
66
|
+
|
|
67
|
+
def _validated_allow_local(value) -> bool:
|
|
68
|
+
"""
|
|
69
|
+
Strictly validate the allow_local_webhooks config value, which gates an SSRF guard.
|
|
70
|
+
|
|
71
|
+
Only a literal `True` enables it; any other truthy non-bool (e.g. a quoted "false"
|
|
72
|
+
string) must not, so logs a warning and defaults to False.
|
|
73
|
+
|
|
74
|
+
Args:
|
|
75
|
+
value: Raw `allow_local_webhooks` value from bot config.
|
|
76
|
+
"""
|
|
77
|
+
if value is not None and not isinstance(value, bool):
|
|
78
|
+
logger.warning(f"Invalid allow_local_webhooks '{value}' (must be true/false); defaulting to false")
|
|
79
|
+
return value is True
|
|
80
|
+
|
|
81
|
+
|
|
66
82
|
_SEARCH_TOOL = {
|
|
67
83
|
"name": "search_messages",
|
|
68
84
|
"description": (
|
|
@@ -926,17 +942,16 @@ class TelegramBot:
|
|
|
926
942
|
"""
|
|
927
943
|
Route Telegram document messages through the document summarisation pipeline.
|
|
928
944
|
|
|
929
|
-
|
|
930
|
-
|
|
931
|
-
|
|
932
|
-
|
|
933
|
-
|
|
934
|
-
|
|
935
|
-
|
|
936
|
-
|
|
937
|
-
|
|
938
|
-
|
|
939
|
-
friendly error before download.
|
|
945
|
+
Gates checked in order:
|
|
946
|
+
trigger - shared _resolve_group_trigger() (mention/nickname/initials/reply-to-bot,
|
|
947
|
+
incl. exclusive-foreign-mention yield); silent in channels, edited
|
|
948
|
+
messages, and untriggered group messages.
|
|
949
|
+
online - same self._online gate as tele_handle_response() (/start, /stop);
|
|
950
|
+
offline reply if down.
|
|
951
|
+
processing - document_processing config flag; friendly reply if disabled, but
|
|
952
|
+
only once triggered (untriggered group documents stay silent
|
|
953
|
+
regardless of the flag).
|
|
954
|
+
file size - friendly error over 20 MB, checked before download.
|
|
940
955
|
|
|
941
956
|
The user message stored in DB is '[Document: filename] caption'; document
|
|
942
957
|
bytes are never persisted. Respects is_private for cross-chat context isolation.
|
|
@@ -1228,6 +1243,39 @@ class TelegramBot:
|
|
|
1228
1243
|
"""Reply to unrecognized commands so the LLM never sees them."""
|
|
1229
1244
|
await update.message.reply_text("Unknown command. Use /help to see available commands.")
|
|
1230
1245
|
|
|
1246
|
+
async def _post_init(self, application: Application) -> None:
    """
    Schedule MCP tool discovery and archival as background tasks.

    Registered as python-telegram-bot's post_init hook so a large archival backlog or
    slow/unreachable MCP servers never block tele_handle_message/tele_handle_document
    from answering the first incoming update.

    Tasks are created through application.create_task().
    NOTE(review): python-telegram-bot only tracks (and awaits on stop()) tasks that are
    created while the application is running, and post_init fires before start() -
    confirm against the pinned PTB version. The task handles are therefore also kept on
    self._background_tasks, so they stay strongly referenced for the bot's lifetime and
    remain reachable for explicit cancellation.

    Args:
        application: The Application instance this hook was registered on.
    """
    scheduled = []
    if self._mcp_entries:
        scheduled.append(application.create_task(self._discover_mcp_tools_background()))
    scheduled.append(application.create_task(run_archival(self.llm)))
    # asyncio holds tasks only weakly; without a strong reference a still-pending
    # task may be garbage-collected before it finishes.
    self._background_tasks = scheduled
|
|
1259
|
+
|
|
1260
|
+
async def _discover_mcp_tools_background(self) -> None:
    """
    Run MCP tool discovery and fold the results into the bot's tool registry.

    Awaits discover_mcp_tools() with the stored mcp_server entries, the tool names that
    are already taken (current webhook tool names plus 'search_messages'), and the
    allow-local flag. On success, self.webhook_schemas and self.webhook_defs are rebound
    to merged copies. Any failure is logged with a traceback and swallowed, so the
    existing tool set stays in place.
    """
    try:
        taken_names = {'search_messages'}.union(self.webhook_defs.keys())
        discovered_schemas, discovered_defs = await discover_mcp_tools(
            self._mcp_entries,
            taken_names,
            allow_local=self._allow_local_webhooks,
        )
        # Rebind to new objects rather than mutating in place, schemas first.
        self.webhook_schemas = self.webhook_schemas + discovered_schemas
        self.webhook_defs = {**self.webhook_defs, **discovered_defs}
        logger.info(f"MCP discovery complete: {len(discovered_schemas)} tool(s) registered")
    except Exception:
        logger.error("Background MCP discovery failed", exc_info=True)
|
|
1278
|
+
|
|
1231
1279
|
def poll(self):
|
|
1232
1280
|
"""
|
|
1233
1281
|
Start the main polling loop for Telegram updates.
|
|
@@ -1240,14 +1288,14 @@ class TelegramBot:
|
|
|
1240
1288
|
|
|
1241
1289
|
# Initialization
|
|
1242
1290
|
def __init__(self,
|
|
1243
|
-
bot_owner
|
|
1244
|
-
bot_nickname
|
|
1245
|
-
bot_initials
|
|
1246
|
-
chat_model
|
|
1247
|
-
url_model
|
|
1248
|
-
token_limit
|
|
1249
|
-
search_limit
|
|
1250
|
-
persona_temp
|
|
1291
|
+
bot_owner = INIT_BOT_CONFIG['bot_owner'],
|
|
1292
|
+
bot_nickname = INIT_BOT_CONFIG['bot_nickname'],
|
|
1293
|
+
bot_initials = INIT_BOT_CONFIG['bot_initials'],
|
|
1294
|
+
chat_model = INIT_BOT_CONFIG['chat_model'],
|
|
1295
|
+
url_model = INIT_BOT_CONFIG['url_model'],
|
|
1296
|
+
token_limit = INIT_BOT_CONFIG['token_limit'],
|
|
1297
|
+
search_limit = INIT_BOT_CONFIG['search_limit'],
|
|
1298
|
+
persona_temp = INIT_BOT_CONFIG['persona_temp'],
|
|
1251
1299
|
archive_days = INIT_BOT_CONFIG['archive_days'],
|
|
1252
1300
|
document_processing = INIT_BOT_CONFIG['document_processing'],
|
|
1253
1301
|
persona_prompt = INIT_BOT_CONFIG['persona_prompt'],
|
|
@@ -1255,6 +1303,8 @@ class TelegramBot:
|
|
|
1255
1303
|
instance_name: str | None = None,
|
|
1256
1304
|
webhook_schemas: list | None = None,
|
|
1257
1305
|
webhook_defs: dict | None = None,
|
|
1306
|
+
mcp_entries: list | None = None,
|
|
1307
|
+
allow_local_webhooks: bool = False,
|
|
1258
1308
|
):
|
|
1259
1309
|
"""
|
|
1260
1310
|
Initialize the Telegram bot with LLM configuration and API keys.
|
|
@@ -1280,6 +1330,10 @@ class TelegramBot:
|
|
|
1280
1330
|
If None, no webhook tools are registered.
|
|
1281
1331
|
webhook_defs: Resolved webhook tool definitions keyed by tool name (from build_tool_registry).
|
|
1282
1332
|
If None, no webhook tools are registered.
|
|
1333
|
+
mcp_entries: Raw 'mcp_server:' config entries (from the 'tools:' block), or None.
|
|
1334
|
+
_post_init() discovers these in the background and merges results into
|
|
1335
|
+
self.webhook_schemas/self.webhook_defs once discovery completes.
|
|
1336
|
+
allow_local_webhooks: Passed through to discover_mcp_tools() when MCP discovery runs.
|
|
1283
1337
|
|
|
1284
1338
|
Side Effects:
|
|
1285
1339
|
- Normalises bot_owner to list[str] and stores in self.telegram['owners'].
|
|
@@ -1296,7 +1350,9 @@ class TelegramBot:
|
|
|
1296
1350
|
self.token_warning = {} # Determines whether user has reached token limit by AI model
|
|
1297
1351
|
self.conversations = {} # Provides Conversation class per user based on bot response
|
|
1298
1352
|
self.webhook_schemas = webhook_schemas or [] # Provider-compatible schemas for webhook tools
|
|
1299
|
-
self.webhook_defs = webhook_defs or {}
|
|
1353
|
+
self.webhook_defs = webhook_defs or {} # Resolved tool definitions keyed by name
|
|
1354
|
+
self._mcp_entries = mcp_entries or [] # Raw mcp_server entries; discovered in _post_init()
|
|
1355
|
+
self._allow_local_webhooks = allow_local_webhooks
|
|
1300
1356
|
owners = bot_owner if isinstance(bot_owner, list) else [bot_owner]
|
|
1301
1357
|
self.telegram = {
|
|
1302
1358
|
'bot_id' : 0, # overwritten by _tele_info(); 0 is a safe sentinel
|
|
@@ -1319,7 +1375,12 @@ class TelegramBot:
|
|
|
1319
1375
|
loop.create_task(self._tele_info())
|
|
1320
1376
|
|
|
1321
1377
|
# Build our application with handlers for Commands, Messages, and Errors
|
|
1322
|
-
self.telegram['app'] =
|
|
1378
|
+
self.telegram['app'] = (
|
|
1379
|
+
Application.builder()
|
|
1380
|
+
.token(os.environ['TELLMGRAMBOT_TELEGRAM_API_KEY'])
|
|
1381
|
+
.post_init(self._post_init)
|
|
1382
|
+
.build()
|
|
1383
|
+
)
|
|
1323
1384
|
self.telegram['app'].add_handler(CommandHandler('help', self.tele_commands))
|
|
1324
1385
|
self.telegram['app'].add_handler(CommandHandler('start', self.tele_start_command))
|
|
1325
1386
|
self.telegram['app'].add_handler(CommandHandler('stop', self.tele_stop_command))
|
|
@@ -1391,9 +1452,10 @@ class TelegramBot:
|
|
|
1391
1452
|
|
|
1392
1453
|
Calls init_structure() to bootstrap directories, API keys, and configuration files,
|
|
1393
1454
|
unpacking a three-tuple (ApiKeyStatus, config dict, persona prompt str with system
|
|
1394
|
-
appendix already appended). Builds webhook tool registry from 'tools:' config
|
|
1395
|
-
|
|
1396
|
-
Applies defaults for any missing values and returns a fully
|
|
1455
|
+
appendix already appended). Builds the webhook tool registry from 'tools:' config and
|
|
1456
|
+
collects any 'mcp_server:' entries (passed through to TelegramBot for later discovery -
|
|
1457
|
+
see Side Effects). Applies defaults for any missing values and returns a fully
|
|
1458
|
+
initialized TelegramBot.
|
|
1397
1459
|
|
|
1398
1460
|
Args:
|
|
1399
1461
|
config_file: Filename of the bot configuration YAML (default: 'config.yaml').
|
|
@@ -1406,50 +1468,42 @@ class TelegramBot:
|
|
|
1406
1468
|
|
|
1407
1469
|
Side Effects:
|
|
1408
1470
|
- Calls init_structure() which creates directories, config/prompt files, and checks API keys.
|
|
1409
|
-
-
|
|
1410
|
-
|
|
1471
|
+
- Passes any 'mcp_server:' entries to TelegramBot as mcp_entries; discovery itself runs
|
|
1472
|
+
in the background via TelegramBot._post_init() once the polling event loop is live.
|
|
1473
|
+
- May log warnings (for missing config values or an empty prompt), but does not print a
|
|
1474
|
+
startup API key status summary.
|
|
1411
1475
|
- Log identity/file label is taken from bot config `instance_name` when set; otherwise defaults to the bot's Telegram username once _tele_info() resolves.
|
|
1412
1476
|
"""
|
|
1413
1477
|
# Bootstrap directories, logging, config, prompt (with appendix), and API keys in one call.
|
|
1414
1478
|
key_status, config, prompt = init_structure(config_file, prompt_file)
|
|
1415
1479
|
|
|
1416
1480
|
# Build the webhook tool registry from the optional 'tools:' block in bot config.
|
|
1417
|
-
allow_local = config['allow_local_webhooks']
|
|
1481
|
+
allow_local = _validated_allow_local(config['allow_local_webhooks'])
|
|
1418
1482
|
webhook_schemas, webhook_defs = build_tool_registry(config.get('tools') or [], allow_local)
|
|
1419
1483
|
|
|
1420
|
-
#
|
|
1484
|
+
# Raw mcp_server entries; TelegramBot._post_init() runs discovery in the background.
|
|
1421
1485
|
mcp_entries = [
|
|
1422
1486
|
e for e in (config.get('tools') or [])
|
|
1423
1487
|
if isinstance(e, dict) and 'mcp_server' in e
|
|
1424
1488
|
]
|
|
1425
|
-
if mcp_entries:
|
|
1426
|
-
existing_names = set(webhook_defs.keys()) | {'search_messages'}
|
|
1427
|
-
mcp_schemas, mcp_defs = [], {}
|
|
1428
|
-
try:
|
|
1429
|
-
asyncio.get_running_loop()
|
|
1430
|
-
logger.warning("MCP discovery skipped: set() called from within an async context.")
|
|
1431
|
-
except RuntimeError:
|
|
1432
|
-
mcp_schemas, mcp_defs = asyncio.run(discover_mcp_tools(
|
|
1433
|
-
mcp_entries, existing_names, allow_local=allow_local,
|
|
1434
|
-
))
|
|
1435
|
-
webhook_schemas = webhook_schemas + mcp_schemas
|
|
1436
|
-
webhook_defs = {**webhook_defs, **mcp_defs}
|
|
1437
1489
|
|
|
1438
1490
|
# Apply parameters to bot:
|
|
1439
1491
|
return TelegramBot(
|
|
1440
|
-
bot_owner
|
|
1441
|
-
bot_nickname
|
|
1442
|
-
bot_initials
|
|
1443
|
-
chat_model
|
|
1444
|
-
url_model
|
|
1445
|
-
token_limit
|
|
1446
|
-
search_limit
|
|
1447
|
-
persona_temp
|
|
1448
|
-
archive_days
|
|
1449
|
-
document_processing
|
|
1450
|
-
persona_prompt
|
|
1451
|
-
key_status
|
|
1452
|
-
instance_name
|
|
1453
|
-
webhook_schemas
|
|
1454
|
-
webhook_defs
|
|
1492
|
+
bot_owner = config['bot_owner'],
|
|
1493
|
+
bot_nickname = config['bot_nickname'],
|
|
1494
|
+
bot_initials = config['bot_initials'],
|
|
1495
|
+
chat_model = config['chat_model'],
|
|
1496
|
+
url_model = config['url_model'],
|
|
1497
|
+
token_limit = config['token_limit'],
|
|
1498
|
+
search_limit = config['search_limit'],
|
|
1499
|
+
persona_temp = config['persona_temp'],
|
|
1500
|
+
archive_days = config['archive_days'],
|
|
1501
|
+
document_processing = config['document_processing'],
|
|
1502
|
+
persona_prompt = prompt,
|
|
1503
|
+
key_status = key_status,
|
|
1504
|
+
instance_name = config['instance_name'],
|
|
1505
|
+
webhook_schemas = webhook_schemas,
|
|
1506
|
+
webhook_defs = webhook_defs,
|
|
1507
|
+
mcp_entries = mcp_entries,
|
|
1508
|
+
allow_local_webhooks = allow_local,
|
|
1455
1509
|
)
|
|
@@ -11,6 +11,7 @@ Tier 2: Episodic summarization - old Tier 1 rows compressed into thematic digest
|
|
|
11
11
|
Raw messages are never deleted; archived_at flags rows to skip during context loading.
|
|
12
12
|
Search still hits raw rows regardless of archived_at.
|
|
13
13
|
"""
|
|
14
|
+
import asyncio
|
|
14
15
|
import json
|
|
15
16
|
import logging
|
|
16
17
|
|
|
@@ -24,6 +25,10 @@ logger = logging.getLogger(__name__)
|
|
|
24
25
|
|
|
25
26
|
_archival_running = False
|
|
26
27
|
|
|
28
|
+
# Caps concurrent in-flight LLM calls per run_archival() invocation so a large
|
|
29
|
+
# backlog doesn't hammer the provider's rate limits.
|
|
30
|
+
_BATCH_CONCURRENCY = 5
|
|
31
|
+
|
|
27
32
|
_TIER1_PROMPT = (
|
|
28
33
|
"Extract key facts from this conversation. "
|
|
29
34
|
"Ignore greetings, acknowledgments, and filler. "
|
|
@@ -88,14 +93,34 @@ def _fmt_ts(ts: str) -> str:
|
|
|
88
93
|
return ts[:16].replace('T', ' ') + ' UTC'
|
|
89
94
|
|
|
90
95
|
|
|
96
|
+
async def _gather_bounded(coros) -> None:
|
|
97
|
+
"""
|
|
98
|
+
Run an iterable of coroutines with at most _BATCH_CONCURRENCY in flight at once.
|
|
99
|
+
|
|
100
|
+
Pulls coroutines from the iterable lazily via a fixed pool of _BATCH_CONCURRENCY
|
|
101
|
+
workers, rather than wrapping every coroutine in a Task up front - so a large
|
|
102
|
+
archival backlog never creates more pending tasks than the concurrency cap.
|
|
103
|
+
|
|
104
|
+
Args:
|
|
105
|
+
coros: Iterable of coroutines to execute with bounded concurrency.
|
|
106
|
+
"""
|
|
107
|
+
coro_iter = iter(coros)
|
|
108
|
+
|
|
109
|
+
async def _worker():
|
|
110
|
+
for coro in coro_iter:
|
|
111
|
+
await coro
|
|
112
|
+
|
|
113
|
+
await asyncio.gather(*(_worker() for _ in range(_BATCH_CONCURRENCY)))
|
|
114
|
+
|
|
115
|
+
|
|
91
116
|
async def _run_tier1(config: dict) -> None:
|
|
92
117
|
"""
|
|
93
118
|
Extract key facts from messages older than archive_days into Tier 1 rows.
|
|
94
119
|
|
|
95
120
|
Groups old messages by chat and day, batches each group through the LLM with
|
|
96
121
|
_TIER1_PROMPT, stores the extracted facts as summary_archive rows, and flags
|
|
97
|
-
source rows with archived_at.
|
|
98
|
-
processing other batches.
|
|
122
|
+
source rows with archived_at. Batches run concurrently, bounded by _BATCH_CONCURRENCY.
|
|
123
|
+
Logs warnings on batch failures but continues processing other batches.
|
|
99
124
|
|
|
100
125
|
Args:
|
|
101
126
|
config: Dict with keys: chat_model, and optionally archive_days.
|
|
@@ -131,12 +156,15 @@ async def _run_tier1(config: dict) -> None:
|
|
|
131
156
|
batches.setdefault(key, []).append(row)
|
|
132
157
|
|
|
133
158
|
provider = get_provider(model)
|
|
134
|
-
|
|
159
|
+
|
|
160
|
+
async def _process(chat_id, day, batch):
|
|
135
161
|
try:
|
|
136
162
|
await _process_tier1_batch(provider, model, chat_id, day, batch)
|
|
137
163
|
except Exception as e:
|
|
138
164
|
logger.warning(f"ARCHIVE: Tier 1 batch {chat_id}/{day} failed: {e}")
|
|
139
165
|
|
|
166
|
+
await _gather_bounded(_process(chat_id, day, batch) for (chat_id, day), batch in batches.items())
|
|
167
|
+
|
|
140
168
|
|
|
141
169
|
async def _process_tier1_batch(provider, model: str, chat_id: int, day: str, rows: list) -> None:
|
|
142
170
|
"""
|
|
@@ -209,10 +237,10 @@ async def _run_tier2(config: dict) -> None:
|
|
|
209
237
|
"""
|
|
210
238
|
Compress old Tier 1 rows into Tier 2 (episodic) summaries.
|
|
211
239
|
|
|
212
|
-
Groups Tier 1 rows older than archive_days * 2 by chat and month, batches each
|
|
213
|
-
|
|
214
|
-
|
|
215
|
-
batch failures but continues processing other batches.
|
|
240
|
+
Groups Tier 1 rows older than archive_days * 2 by chat and month, batches each group through the LLM
|
|
241
|
+
with _TIER2_PROMPT, stores the result as a summary_archive row (tier 2), and flags source Tier 1 rows
|
|
242
|
+
with archived_at. Batches run concurrently, bounded by _BATCH_CONCURRENCY.
|
|
243
|
+
Logs warnings on batch failures but continues processing other batches.
|
|
216
244
|
|
|
217
245
|
Args:
|
|
218
246
|
config: Dict with keys: chat_model, and optionally archive_days.
|
|
@@ -246,12 +274,15 @@ async def _run_tier2(config: dict) -> None:
|
|
|
246
274
|
batches.setdefault(key, []).append(row)
|
|
247
275
|
|
|
248
276
|
provider = get_provider(model)
|
|
249
|
-
|
|
277
|
+
|
|
278
|
+
async def _process(chat_id, month, batch):
|
|
250
279
|
try:
|
|
251
280
|
await _process_tier2_batch(provider, model, chat_id, month, batch)
|
|
252
281
|
except Exception as e:
|
|
253
282
|
logger.warning(f"ARCHIVE: Tier 2 batch {chat_id}/{month} failed: {e}")
|
|
254
283
|
|
|
284
|
+
await _gather_bounded(_process(chat_id, month, batch) for (chat_id, month), batch in batches.items())
|
|
285
|
+
|
|
255
286
|
|
|
256
287
|
async def _process_tier2_batch(provider, model: str, chat_id: int, month: str, rows: list) -> None:
|
|
257
288
|
"""
|
|
@@ -13,7 +13,6 @@ from typing import Optional
|
|
|
13
13
|
|
|
14
14
|
from .utils import read_text, generate_file_path, read_yaml, execution_dir, generate_filename
|
|
15
15
|
from .database import set_db_filename, init_db, get_db_path
|
|
16
|
-
from .archive import run_archival
|
|
17
16
|
|
|
18
17
|
logger = logging.getLogger(__name__)
|
|
19
18
|
_logging_initialized = False
|
|
@@ -529,6 +528,9 @@ def init_structure(
|
|
|
529
528
|
the custom filename (only if the target does not yet exist). This ordering ensures the DB
|
|
530
529
|
filename is settled before schema init.
|
|
531
530
|
|
|
531
|
+
Archival (Tier 1 and Tier 2) runs as a background task scheduled by TelegramBot._post_init()
|
|
532
|
+
once the polling event loop is live, so large backlogs never block the bot's first message response.
|
|
533
|
+
|
|
532
534
|
Args:
|
|
533
535
|
config_file: Name of the bot configuration file (default: 'config.yaml').
|
|
534
536
|
Resolved relative to TELLMGRAMBOT_CONFIGS_PATH.
|
|
@@ -576,9 +578,9 @@ def init_structure(
|
|
|
576
578
|
else:
|
|
577
579
|
logger.debug(f"DB migration skipped: conversations.db not found at {legacy_path}")
|
|
578
580
|
|
|
579
|
-
# In the sync path (no running loop), run DB init now
|
|
580
|
-
# In the async path (running loop
|
|
581
|
-
#
|
|
581
|
+
# In the sync path (no running loop), run DB init now.
|
|
582
|
+
# In the async path (running loop - e.g. a caller awaiting init_structure() from
|
|
583
|
+
# inside an already-async context), DB init is deferred into a background task.
|
|
582
584
|
try:
|
|
583
585
|
loop = asyncio.get_running_loop()
|
|
584
586
|
_has_loop = True
|
|
@@ -593,18 +595,14 @@ def init_structure(
|
|
|
593
595
|
logger.warning(f"File '{prompt_file}' is empty, using default persona prompt.")
|
|
594
596
|
prompt = prompt.rstrip() + "\n\n" + _SYSTEM_APPENDIX
|
|
595
597
|
|
|
596
|
-
# Load API keys before running archival so the provider can authenticate.
|
|
597
598
|
key_status = init_keys(config)
|
|
598
599
|
|
|
599
600
|
if _has_loop:
|
|
600
|
-
async def
|
|
601
|
+
async def _init_db_task():
|
|
601
602
|
try:
|
|
602
603
|
await init_db()
|
|
603
|
-
await run_archival(config)
|
|
604
604
|
except Exception:
|
|
605
|
-
logger.error(
|
|
606
|
-
loop.create_task(
|
|
607
|
-
else:
|
|
608
|
-
asyncio.run(run_archival(config))
|
|
605
|
+
logger.error("Background startup DB initialization failed", exc_info=True)
|
|
606
|
+
loop.create_task(_init_db_task())
|
|
609
607
|
|
|
610
608
|
return (key_status, config, prompt)
|
|
@@ -2,11 +2,14 @@
|
|
|
2
2
|
import io
|
|
3
3
|
import logging
|
|
4
4
|
import re
|
|
5
|
+
import zipfile
|
|
5
6
|
from pathlib import Path
|
|
6
7
|
from typing import Optional
|
|
7
8
|
|
|
8
9
|
from charset_normalizer import from_bytes as _cn_from_bytes
|
|
9
10
|
import defusedxml.ElementTree as _defusedxml_ET
|
|
11
|
+
import docx
|
|
12
|
+
import openpyxl
|
|
10
13
|
import pypdf
|
|
11
14
|
|
|
12
15
|
from .utils import log_error
|
|
@@ -56,7 +59,13 @@ _PLAIN_TEXT_EXTENSIONS = frozenset({
|
|
|
56
59
|
|
|
57
60
|
_HTML_MIMES = frozenset({'text/html', 'application/xhtml+xml'})
|
|
58
61
|
_XML_MIMES = frozenset({'text/xml', 'application/xml'})
|
|
62
|
+
_DOCX_MIME = 'application/vnd.openxmlformats-officedocument.wordprocessingml.document'
|
|
63
|
+
_XLSX_MIME = 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet'
|
|
59
64
|
_PDF_PAGE_CAP = 100
|
|
65
|
+
_DOCX_LINE_CAP = 2000
|
|
66
|
+
_XLSX_ROW_CAP = 2000
|
|
67
|
+
_ZIP_ENTRY_SIZE_CAP = 100_000_000 # 100 MB uncompressed per entry - zip-bomb guard
|
|
68
|
+
_ZIP_TOTAL_SIZE_CAP = 500_000_000 # 500 MB uncompressed across all entries combined
|
|
60
69
|
|
|
61
70
|
|
|
62
71
|
def handle_greetings(text: str) -> Optional[str]:
|
|
@@ -87,6 +96,39 @@ def handle_common_queries(text: str) -> Optional[str]:
|
|
|
87
96
|
return None
|
|
88
97
|
|
|
89
98
|
|
|
99
|
+
def _zip_entries_within_cap(file_bytes: bytes) -> bool:
|
|
100
|
+
"""
|
|
101
|
+
Check declared uncompressed entry sizes in a zip-based file before decompression.
|
|
102
|
+
|
|
103
|
+
Guards against zip-bomb style resource exhaustion for .docx/.xlsx (both are zip
|
|
104
|
+
containers), without actually decompressing anything. Rejects a file if any single
|
|
105
|
+
entry's declared uncompressed size exceeds _ZIP_ENTRY_SIZE_CAP, or if the sum across
|
|
106
|
+
all entries exceeds _ZIP_TOTAL_SIZE_CAP (catches many small entries that each pass
|
|
107
|
+
the per-entry cap but together still exhaust memory). Returns False for a corrupted/
|
|
108
|
+
non-zip file too, since python-docx/openpyxl would fail on it anyway.
|
|
109
|
+
|
|
110
|
+
Args:
|
|
111
|
+
file_bytes: Raw file bytes to inspect.
|
|
112
|
+
|
|
113
|
+
Returns:
|
|
114
|
+
True if the file is a valid zip, every entry's declared size is within the
|
|
115
|
+
per-entry cap, and the total declared size is within the total cap; False
|
|
116
|
+
otherwise.
|
|
117
|
+
"""
|
|
118
|
+
try:
|
|
119
|
+
with zipfile.ZipFile(io.BytesIO(file_bytes)) as zf:
|
|
120
|
+
total = 0
|
|
121
|
+
for info in zf.infolist():
|
|
122
|
+
if info.file_size > _ZIP_ENTRY_SIZE_CAP:
|
|
123
|
+
return False
|
|
124
|
+
total += info.file_size
|
|
125
|
+
if total > _ZIP_TOTAL_SIZE_CAP:
|
|
126
|
+
return False
|
|
127
|
+
return True
|
|
128
|
+
except Exception:
|
|
129
|
+
return False
|
|
130
|
+
|
|
131
|
+
|
|
90
132
|
def _decode_bytes(raw: bytes) -> tuple:
|
|
91
133
|
"""
|
|
92
134
|
Decode raw bytes to a string via UTF-8 -> charset-normalizer -> Latin-1 chain.
|
|
@@ -108,12 +150,16 @@ def _extract_document_text(file_bytes: bytes, mime_type: str, filename: str) ->
|
|
|
108
150
|
"""
|
|
109
151
|
Extract plain text from document bytes, routing by MIME type and file extension.
|
|
110
152
|
|
|
111
|
-
|
|
112
|
-
|
|
113
|
-
|
|
114
|
-
|
|
115
|
-
|
|
116
|
-
|
|
153
|
+
Extraction by type:
|
|
154
|
+
PDF - pypdf, capped at _PDF_PAGE_CAP pages, strict=False.
|
|
155
|
+
.docx - python-docx; paragraphs + flattened table rows, capped at _DOCX_LINE_CAP combined lines.
|
|
156
|
+
.xlsx - openpyxl in read_only/data_only mode; capped at _XLSX_ROW_CAP rows total across all sheets.
|
|
157
|
+
.docx/.xlsx are zip containers, so _zip_entries_within_cap() rejects implausible declared
|
|
158
|
+
uncompressed entry sizes before either library decompresses anything (zip-bomb guard).
|
|
159
|
+
HTML - tags stripped via strip_html_markup.
|
|
160
|
+
XML - defusedxml (XXE-safe); falls back to plain-text decode if malformed.
|
|
161
|
+
other - UTF-8 -> charset-normalizer -> Latin-1 chain
|
|
162
|
+
non-UTF-8 files get a [File encoding: ...] prefix.
|
|
117
163
|
|
|
118
164
|
Args:
|
|
119
165
|
file_bytes: Raw document bytes downloaded from Telegram.
|
|
@@ -128,6 +174,8 @@ def _extract_document_text(file_bytes: bytes, mime_type: str, filename: str) ->
|
|
|
128
174
|
ext = Path(filename).suffix.lower() if filename else ''
|
|
129
175
|
|
|
130
176
|
is_pdf = mime == 'application/pdf' or ext == '.pdf'
|
|
177
|
+
is_docx = ext == '.docx' or mime == _DOCX_MIME
|
|
178
|
+
is_xlsx = ext == '.xlsx' or mime == _XLSX_MIME
|
|
131
179
|
is_html = ext in ('.html', '.htm') or mime in _HTML_MIMES
|
|
132
180
|
is_xml = ext == '.xml' or mime in _XML_MIMES
|
|
133
181
|
is_plain = mime.startswith('text/') or ext in _PLAIN_TEXT_EXTENSIONS
|
|
@@ -145,6 +193,67 @@ def _extract_document_text(file_bytes: bytes, mime_type: str, filename: str) ->
|
|
|
145
193
|
log_error(e, 'PDF')
|
|
146
194
|
return None, "Something went wrong while reading that PDF. Please try again."
|
|
147
195
|
|
|
196
|
+
if is_docx:
|
|
197
|
+
if not _zip_entries_within_cap(file_bytes):
|
|
198
|
+
return None, "Something went wrong while reading that document. Please try again."
|
|
199
|
+
try:
|
|
200
|
+
document = docx.Document(io.BytesIO(file_bytes))
|
|
201
|
+
lines = []
|
|
202
|
+
for p in document.paragraphs:
|
|
203
|
+
if len(lines) >= _DOCX_LINE_CAP:
|
|
204
|
+
break
|
|
205
|
+
if p.text:
|
|
206
|
+
lines.append(p.text)
|
|
207
|
+
for table in document.tables:
|
|
208
|
+
if len(lines) >= _DOCX_LINE_CAP:
|
|
209
|
+
break
|
|
210
|
+
for row in table.rows:
|
|
211
|
+
if len(lines) >= _DOCX_LINE_CAP:
|
|
212
|
+
break
|
|
213
|
+
row_text = '\t'.join(cell.text for cell in row.cells)
|
|
214
|
+
if row_text.strip():
|
|
215
|
+
lines.append(row_text)
|
|
216
|
+
text = '\n'.join(lines)
|
|
217
|
+
if not text.strip():
|
|
218
|
+
return None, "This document appears to have no readable text in it."
|
|
219
|
+
return text, None
|
|
220
|
+
except Exception as e:
|
|
221
|
+
log_error(e, 'DOCX')
|
|
222
|
+
return None, "Something went wrong while reading that document. Please try again."
|
|
223
|
+
|
|
224
|
+
if is_xlsx:
|
|
225
|
+
if not _zip_entries_within_cap(file_bytes):
|
|
226
|
+
return None, "Something went wrong while reading that spreadsheet. Please try again."
|
|
227
|
+
workbook = None
|
|
228
|
+
try:
|
|
229
|
+
workbook = openpyxl.load_workbook(io.BytesIO(file_bytes), read_only=True, data_only=True)
|
|
230
|
+
lines = []
|
|
231
|
+
total_rows = 0
|
|
232
|
+
for sheet in workbook.worksheets:
|
|
233
|
+
if total_rows >= _XLSX_ROW_CAP:
|
|
234
|
+
break
|
|
235
|
+
sheet_lines = []
|
|
236
|
+
for row in sheet.iter_rows(values_only=True):
|
|
237
|
+
if total_rows >= _XLSX_ROW_CAP:
|
|
238
|
+
break
|
|
239
|
+
values = ['' if v is None else str(v) for v in row]
|
|
240
|
+
if any(values):
|
|
241
|
+
sheet_lines.append('\t'.join(values))
|
|
242
|
+
total_rows += 1
|
|
243
|
+
if sheet_lines:
|
|
244
|
+
lines.append(f"## Sheet: {sheet.title}")
|
|
245
|
+
lines.extend(sheet_lines)
|
|
246
|
+
text = '\n'.join(lines)
|
|
247
|
+
if not text.strip():
|
|
248
|
+
return None, "This spreadsheet appears to have no readable data in it."
|
|
249
|
+
return text, None
|
|
250
|
+
except Exception as e:
|
|
251
|
+
log_error(e, 'XLSX')
|
|
252
|
+
return None, "Something went wrong while reading that spreadsheet. Please try again."
|
|
253
|
+
finally:
|
|
254
|
+
if workbook is not None:
|
|
255
|
+
workbook.close()
|
|
256
|
+
|
|
148
257
|
if is_html:
|
|
149
258
|
raw_text, _ = _decode_bytes(file_bytes)
|
|
150
259
|
return strip_html_markup(raw_text), None
|
|
@@ -166,7 +275,7 @@ def _extract_document_text(file_bytes: bytes, mime_type: str, filename: str) ->
|
|
|
166
275
|
text = f"[File encoding: {encoding}]\n{text}"
|
|
167
276
|
return text, None
|
|
168
277
|
|
|
169
|
-
return None, "I can only read
|
|
278
|
+
return None, "I can only read PDF, Word documents (.docx), spreadsheets (.xlsx), HTML, XML, and plain text files right now."
|
|
170
279
|
|
|
171
280
|
|
|
172
281
|
async def summarise_text(
|
|
@@ -179,12 +288,10 @@ async def summarise_text(
|
|
|
179
288
|
"""
|
|
180
289
|
Token-prune content, apply template, and complete via the LLM.
|
|
181
290
|
|
|
182
|
-
Prunes content so the fully composed
|
|
183
|
-
|
|
184
|
-
|
|
185
|
-
|
|
186
|
-
a large template or persona prompt is accounted for, not just the content itself. The
|
|
187
|
-
template must contain a {content} placeholder.
|
|
291
|
+
Prunes content so the fully composed message (prompt + template + content) fits the
|
|
292
|
+
model's token budget (max_tokens - 500), measuring the composed message at every
|
|
293
|
+
pruning step - not just raw content - so the budget guarantee matches what's actually
|
|
294
|
+
sent. The template must contain a {content} placeholder.
|
|
188
295
|
|
|
189
296
|
Args:
|
|
190
297
|
content: Text content to summarise (URL body or document text).
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: TeLLMgramBot
|
|
3
|
-
Version: 3.15.0
|
|
3
|
+
Version: 3.15.2
|
|
4
4
|
Summary: LLM-powered Telegram bot (OpenAI + Anthropic)
|
|
5
5
|
Home-page: https://github.com/Digital-Heresy/TeLLMgramBot
|
|
6
6
|
Author: Digital Heresy
|
|
@@ -22,6 +22,8 @@ Requires-Dist: tzdata>=2025.2
|
|
|
22
22
|
Requires-Dist: pypdf>=6.0
|
|
23
23
|
Requires-Dist: defusedxml>=0.7
|
|
24
24
|
Requires-Dist: charset-normalizer>=3.0
|
|
25
|
+
Requires-Dist: python-docx>=1.2
|
|
26
|
+
Requires-Dist: openpyxl>=3.1
|
|
25
27
|
Dynamic: author
|
|
26
28
|
Dynamic: author-email
|
|
27
29
|
Dynamic: description
|
|
@@ -45,7 +47,7 @@ The basic goal of this project is to create a bridge between a Telegram Bot and
|
|
|
45
47
|
* Example: "What do you think of this article? [https://some_site/article]"
|
|
46
48
|
* Uses a separate model (configurable via `url_model`) to handle larger URL content.
|
|
47
49
|
* Share documents and text files for analysis and summarisation.
|
|
48
|
-
* Supported formats: PDF, plain-text files (.txt, .md, .rst, .csv, .json, etc.), HTML, and XML.
|
|
50
|
+
* Supported formats: PDF (via pypdf), Microsoft Office documents (.docx via python-docx, .xlsx via openpyxl), plain-text files (.txt, .md, .rst, .csv, .json, etc.), HTML, and XML (via defusedxml).
|
|
49
51
|
* The bot extracts and summarises content, with automatic encoding detection for non-UTF-8 files. Files over 20 MB are rejected.
|
|
50
52
|
* Can be disabled via `document_processing: false` in config.
|
|
51
53
|
* Ask questions about message history across all your chats using natural language; the bot will search, attribute messages to speakers, and include messages from other bots.
|
|
@@ -5,7 +5,7 @@ with open("README.md", "r") as fh:
|
|
|
5
5
|
|
|
6
6
|
setup(
|
|
7
7
|
name='TeLLMgramBot',
|
|
8
|
-
version='3.15.0',
|
|
8
|
+
version='3.15.2',
|
|
9
9
|
packages=find_packages(),
|
|
10
10
|
license='MIT',
|
|
11
11
|
author='Digital Heresy',
|
|
@@ -28,6 +28,8 @@ setup(
|
|
|
28
28
|
'pypdf>=6.0',
|
|
29
29
|
'defusedxml>=0.7',
|
|
30
30
|
'charset-normalizer>=3.0',
|
|
31
|
+
'python-docx>=1.2',
|
|
32
|
+
'openpyxl>=3.1',
|
|
31
33
|
],
|
|
32
34
|
python_requires='>=3.10'
|
|
33
35
|
)
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|