TeLLMgramBot 3.14.2__tar.gz → 3.15.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {tellmgrambot-3.14.2 → tellmgrambot-3.15.0}/PKG-INFO +9 -1
- {tellmgrambot-3.14.2 → tellmgrambot-3.15.0}/README.md +5 -0
- {tellmgrambot-3.14.2 → tellmgrambot-3.15.0}/TeLLMgramBot/TeLLMgramBot.py +262 -107
- {tellmgrambot-3.14.2 → tellmgrambot-3.15.0}/TeLLMgramBot/initialize.py +2 -0
- tellmgrambot-3.15.0/TeLLMgramBot/message_handlers.py +316 -0
- {tellmgrambot-3.14.2 → tellmgrambot-3.15.0}/TeLLMgramBot/tools.py +2 -2
- {tellmgrambot-3.14.2 → tellmgrambot-3.15.0}/TeLLMgramBot.egg-info/PKG-INFO +9 -1
- {tellmgrambot-3.14.2 → tellmgrambot-3.15.0}/TeLLMgramBot.egg-info/requires.txt +3 -0
- {tellmgrambot-3.14.2 → tellmgrambot-3.15.0}/setup.py +5 -2
- tellmgrambot-3.14.2/TeLLMgramBot/message_handlers.py +0 -153
- {tellmgrambot-3.14.2 → tellmgrambot-3.15.0}/LICENSE +0 -0
- {tellmgrambot-3.14.2 → tellmgrambot-3.15.0}/TeLLMgramBot/__init__.py +0 -0
- {tellmgrambot-3.14.2 → tellmgrambot-3.15.0}/TeLLMgramBot/archive.py +0 -0
- {tellmgrambot-3.14.2 → tellmgrambot-3.15.0}/TeLLMgramBot/conversation.py +0 -0
- {tellmgrambot-3.14.2 → tellmgrambot-3.15.0}/TeLLMgramBot/database.py +0 -0
- {tellmgrambot-3.14.2 → tellmgrambot-3.15.0}/TeLLMgramBot/models.py +0 -0
- {tellmgrambot-3.14.2 → tellmgrambot-3.15.0}/TeLLMgramBot/providers/__init__.py +0 -0
- {tellmgrambot-3.14.2 → tellmgrambot-3.15.0}/TeLLMgramBot/providers/anthropic_provider.py +0 -0
- {tellmgrambot-3.14.2 → tellmgrambot-3.15.0}/TeLLMgramBot/providers/base.py +0 -0
- {tellmgrambot-3.14.2 → tellmgrambot-3.15.0}/TeLLMgramBot/providers/factory.py +0 -0
- {tellmgrambot-3.14.2 → tellmgrambot-3.15.0}/TeLLMgramBot/providers/openai_provider.py +0 -0
- {tellmgrambot-3.14.2 → tellmgrambot-3.15.0}/TeLLMgramBot/utils.py +0 -0
- {tellmgrambot-3.14.2 → tellmgrambot-3.15.0}/TeLLMgramBot/web_utils.py +0 -0
- {tellmgrambot-3.14.2 → tellmgrambot-3.15.0}/TeLLMgramBot.egg-info/SOURCES.txt +0 -0
- {tellmgrambot-3.14.2 → tellmgrambot-3.15.0}/TeLLMgramBot.egg-info/dependency_links.txt +0 -0
- {tellmgrambot-3.14.2 → tellmgrambot-3.15.0}/TeLLMgramBot.egg-info/top_level.txt +0 -0
- {tellmgrambot-3.14.2 → tellmgrambot-3.15.0}/setup.cfg +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: TeLLMgramBot
|
|
3
|
-
Version: 3.14.2
|
|
3
|
+
Version: 3.15.0
|
|
4
4
|
Summary: LLM-powered Telegram bot (OpenAI + Anthropic)
|
|
5
5
|
Home-page: https://github.com/Digital-Heresy/TeLLMgramBot
|
|
6
6
|
Author: Digital Heresy
|
|
@@ -19,6 +19,9 @@ Requires-Dist: tiktoken>=0.12
|
|
|
19
19
|
Requires-Dist: python-telegram-bot>=20.8
|
|
20
20
|
Requires-Dist: aiosqlite>=0.19
|
|
21
21
|
Requires-Dist: tzdata>=2025.2
|
|
22
|
+
Requires-Dist: pypdf>=6.0
|
|
23
|
+
Requires-Dist: defusedxml>=0.7
|
|
24
|
+
Requires-Dist: charset-normalizer>=3.0
|
|
22
25
|
Dynamic: author
|
|
23
26
|
Dynamic: author-email
|
|
24
27
|
Dynamic: description
|
|
@@ -41,6 +44,10 @@ The basic goal of this project is to create a bridge between a Telegram Bot and
|
|
|
41
44
|
* Pass URLs in [square brackets] and mention how the bot should interpret them.
|
|
42
45
|
* Example: "What do you think of this article? [https://some_site/article]"
|
|
43
46
|
* Uses a separate model (configurable via `url_model`) to handle larger URL content.
|
|
47
|
+
* Share documents and text files for analysis and summarisation.
|
|
48
|
+
* Supported formats: PDF, plain-text files (.txt, .md, .rst, .csv, .json, etc.), HTML, and XML.
|
|
49
|
+
* The bot extracts and summarises content, with automatic encoding detection for non-UTF-8 files. Files over 20 MB are rejected.
|
|
50
|
+
* Can be disabled via `document_processing: false` in config.
|
|
44
51
|
* Ask questions about message history across all your chats using natural language; the bot will search, attribute messages to speakers, and include messages from other bots.
|
|
45
52
|
* Example: "Who said thanks for the breakdown?" or "What did George say about the project?" or "Show me the last few messages."
|
|
46
53
|
* All search filters (speaker, chat, date) are optional. Results are ordered most-recent-first. Configure `search_limit` to control how many results to return (default: 30).
|
|
@@ -157,6 +164,7 @@ When the bot is triggered in a group and about to respond (not deferring to anot
|
|
|
157
164
|
- `token_limit`: Max tokens (optional; defaults to model's maximum)
|
|
158
165
|
- `search_limit`: Max search results (optional; defaults to 30)
|
|
159
166
|
- `archive_days`: Days before messages are eligible for archival (optional; default 60, minimum 1). Older messages are distilled into daily summaries, then progressively compressed into monthly digests. Once archived their respective raw messages do not return to the LLM context any more, only when searching messages.
|
|
167
|
+
- `document_processing`: Optional bool (default: true). Set to false to disable document and text file summarisation.
|
|
160
168
|
- `allow_local_webhooks`: Set to `true` to permit webhook/MCP URLs targeting loopback or link-local addresses (optional; default `false`). Useful when tools like Home Assistant run on the same host.
|
|
161
169
|
- `tools`: Optional list of webhook and MCP tool definitions (admin-only, private chat only). See [docs/tools.md](docs/tools.md) for schema and examples.
|
|
162
170
|
4. **Disable group privacy mode in BotFather:**
|
|
@@ -9,6 +9,10 @@ The basic goal of this project is to create a bridge between a Telegram Bot and
|
|
|
9
9
|
* Pass URLs in [square brackets] and mention how the bot should interpret them.
|
|
10
10
|
* Example: "What do you think of this article? [https://some_site/article]"
|
|
11
11
|
* Uses a separate model (configurable via `url_model`) to handle larger URL content.
|
|
12
|
+
* Share documents and text files for analysis and summarisation.
|
|
13
|
+
* Supported formats: PDF, plain-text files (.txt, .md, .rst, .csv, .json, etc.), HTML, and XML.
|
|
14
|
+
* The bot extracts and summarises content, with automatic encoding detection for non-UTF-8 files. Files over 20 MB are rejected.
|
|
15
|
+
* Can be disabled via `document_processing: false` in config.
|
|
12
16
|
* Ask questions about message history across all your chats using natural language; the bot will search, attribute messages to speakers, and include messages from other bots.
|
|
13
17
|
* Example: "Who said thanks for the breakdown?" or "What did George say about the project?" or "Show me the last few messages."
|
|
14
18
|
* All search filters (speaker, chat, date) are optional. Results are ordered most-recent-first. Configure `search_limit` to control how many results to return (default: 30).
|
|
@@ -125,6 +129,7 @@ When the bot is triggered in a group and about to respond (not deferring to anot
|
|
|
125
129
|
- `token_limit`: Max tokens (optional; defaults to model's maximum)
|
|
126
130
|
- `search_limit`: Max search results (optional; defaults to 30)
|
|
127
131
|
- `archive_days`: Days before messages are eligible for archival (optional; default 60, minimum 1). Older messages are distilled into daily summaries, then progressively compressed into monthly digests. Once archived their respective raw messages do not return to the LLM context any more, only when searching messages.
|
|
132
|
+
- `document_processing`: Optional bool (default: true). Set to false to disable document and text file summarisation.
|
|
128
133
|
- `allow_local_webhooks`: Set to `true` to permit webhook/MCP URLs targeting loopback or link-local addresses (optional; default `false`). Useful when tools like Home Assistant run on the same host.
|
|
129
134
|
- `tools`: Optional list of webhook and MCP tool definitions (admin-only, private chat only). See [docs/tools.md](docs/tools.md) for schema and examples.
|
|
130
135
|
4. **Disable group privacy mode in BotFather:**
|
|
@@ -40,7 +40,7 @@ from .initialize import (
|
|
|
40
40
|
bind_log_identity,
|
|
41
41
|
init_structure,
|
|
42
42
|
)
|
|
43
|
-
from .message_handlers import handle_greetings, handle_common_queries, handle_url_ask
|
|
43
|
+
from .message_handlers import handle_greetings, handle_common_queries, handle_url_ask, handle_document_message
|
|
44
44
|
from .models import TokenLimits
|
|
45
45
|
from .tools import build_tool_registry, discover_mcp_tools, execute_mcp, execute_webhook
|
|
46
46
|
from .providers.factory import get_provider
|
|
@@ -50,16 +50,18 @@ from .utils import exact_word_match, log_error
|
|
|
50
50
|
logger = logging.getLogger(__name__)
|
|
51
51
|
|
|
52
52
|
# Dialog copy - centralised so tests never hard-code these strings
|
|
53
|
-
_MSG_ADMIN_ONLY
|
|
54
|
-
_MSG_PROCESS_ERROR
|
|
55
|
-
_MSG_TOOL_RESULT_ERROR
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
|
|
53
|
+
_MSG_ADMIN_ONLY = "Sorry, I can't do that for you."
|
|
54
|
+
_MSG_PROCESS_ERROR = "Sorry, I couldn't process your message! Please contact my creator."
|
|
55
|
+
_MSG_TOOL_RESULT_ERROR = "Sorry, I couldn't process the tool result."
|
|
56
|
+
_MSG_DOC_PROCESSING_OFF = "Sorry, I can't process documents right now."
|
|
57
|
+
_MSG_OFFLINE = "I'd love to chat, but I am offline at the moment!"
|
|
58
|
+
_MSG_NOT_YOUR_PROMPT = "Sorry, this prompt is not for you!"
|
|
59
|
+
_MSG_WIPE_PROMPT = "ALL of my memories will be lost! Are you sure?"
|
|
60
|
+
_MSG_WIPE_COMPLETE = "Wipe complete. I hope you won't regret this..."
|
|
61
|
+
_MSG_WIPE_CANCELLED = "Wipe cancelled. Whew, you scared me for a moment!"
|
|
62
|
+
_MSG_FORGET_PROMPT = "Do you really want me to forget our memories together?"
|
|
63
|
+
_MSG_FORGET_COMPLETE = "Forget complete. Fresh start it is..."
|
|
64
|
+
_MSG_FORGET_CANCELLED = "Forget cancelled. Glad you changed your mind!"
|
|
63
65
|
|
|
64
66
|
_SEARCH_TOOL = {
|
|
65
67
|
"name": "search_messages",
|
|
@@ -409,6 +411,44 @@ class TelegramBot:
|
|
|
409
411
|
"from group conversation contexts. Use /private off to disable."
|
|
410
412
|
)
|
|
411
413
|
|
|
414
|
+
async def _get_or_load_conversation(
|
|
415
|
+
self, chat_id: int, chat_type: str, chat_title: str | None, user_id: int,
|
|
416
|
+
) -> Conversation:
|
|
417
|
+
"""
|
|
418
|
+
Get the Conversation for chat_id, creating it if new, and load/refresh the user's context.
|
|
419
|
+
|
|
420
|
+
Creates a new Conversation keyed by chat_id on first use, refreshes its "Current date
|
|
421
|
+
and time" line, then loads the user's cross-chat history on first appearance this
|
|
422
|
+
session (get_past_interaction) or checks for new messages since the last load
|
|
423
|
+
(refresh_user_context).
|
|
424
|
+
|
|
425
|
+
Args:
|
|
426
|
+
chat_id: Telegram chat ID.
|
|
427
|
+
chat_type: 'private', 'group', or 'supergroup'.
|
|
428
|
+
chat_title: Chat title, or None for private chats.
|
|
429
|
+
user_id: Telegram user ID triggering this message.
|
|
430
|
+
|
|
431
|
+
Returns:
|
|
432
|
+
The active Conversation for this chat.
|
|
433
|
+
"""
|
|
434
|
+
if chat_id not in self.conversations:
|
|
435
|
+
self.conversations[chat_id] = Conversation(
|
|
436
|
+
chat_id, chat_type, self.llm['prompt'], self.llm['chat_model'], chat_title,
|
|
437
|
+
)
|
|
438
|
+
conv = self.conversations[chat_id]
|
|
439
|
+
conv.update_datetime()
|
|
440
|
+
|
|
441
|
+
token_budget = floor(self.llm['prune_threshold'] / 2)
|
|
442
|
+
if user_id not in conv._context_cursor:
|
|
443
|
+
# First appearance of this user in this session - load their cross-chat history.
|
|
444
|
+
# Pass bot_id so private chats can also load shared group context.
|
|
445
|
+
# If no history exists yet, the cursor stays unset so the next message retries.
|
|
446
|
+
await conv.get_past_interaction(token_budget, user_id, self.telegram['bot_id'])
|
|
447
|
+
else:
|
|
448
|
+
# Already loaded - check for new cross-chat messages since last load.
|
|
449
|
+
await conv.refresh_user_context(user_id, token_budget)
|
|
450
|
+
return conv
|
|
451
|
+
|
|
412
452
|
async def tele_handle_response(self, text: str, msg: Message) -> tuple[str, int | None]:
|
|
413
453
|
"""
|
|
414
454
|
Primary function for handling any response including Generative AI, ensuring:
|
|
@@ -448,7 +488,7 @@ class TelegramBot:
|
|
|
448
488
|
"""
|
|
449
489
|
# Starting ensures we get some kind of user account details for logging
|
|
450
490
|
if not self._online:
|
|
451
|
-
return
|
|
491
|
+
return _MSG_OFFLINE, None
|
|
452
492
|
|
|
453
493
|
# Extract identity and context from the message
|
|
454
494
|
user_id = msg.from_user.id
|
|
@@ -461,31 +501,7 @@ class TelegramBot:
|
|
|
461
501
|
identity = f"@{username}" if username else ' '.join(filter(None, [first_name, last_name]))
|
|
462
502
|
logger.info(f"User {user_id} ({identity}) in {chat_type} Chat {chat_id}{f' ({chat_title})' if chat_title else ''}")
|
|
463
503
|
|
|
464
|
-
|
|
465
|
-
if chat_id not in self.conversations:
|
|
466
|
-
self.conversations[chat_id] = Conversation(
|
|
467
|
-
chat_id,
|
|
468
|
-
chat_type,
|
|
469
|
-
self.llm['prompt'],
|
|
470
|
-
self.llm['chat_model'],
|
|
471
|
-
chat_title,
|
|
472
|
-
)
|
|
473
|
-
|
|
474
|
-
conv = self.conversations[chat_id]
|
|
475
|
-
|
|
476
|
-
# Refresh datetime on every message
|
|
477
|
-
conv.update_datetime()
|
|
478
|
-
|
|
479
|
-
token_budget = floor(self.llm['prune_threshold'] / 2)
|
|
480
|
-
|
|
481
|
-
if user_id not in conv._context_cursor:
|
|
482
|
-
# First appearance of this user in this session - load their cross-chat history.
|
|
483
|
-
# Pass bot_id so private chats can also load shared group context.
|
|
484
|
-
# If no history exists yet, the cursor stays unset so the next message retries.
|
|
485
|
-
await conv.get_past_interaction(token_budget, user_id, self.telegram['bot_id'])
|
|
486
|
-
else:
|
|
487
|
-
# Already loaded - check for new cross-chat messages since last load.
|
|
488
|
-
await conv.refresh_user_context(user_id, token_budget)
|
|
504
|
+
conv = await self._get_or_load_conversation(chat_id, chat_type, chat_title, user_id)
|
|
489
505
|
|
|
490
506
|
# Surface the replied-to message into context before adding the triggering message.
|
|
491
507
|
await self._surface_replied_to_message(msg, conv)
|
|
@@ -695,29 +711,32 @@ class TelegramBot:
|
|
|
695
711
|
)
|
|
696
712
|
await prune_bot_messages(msg.chat.id)
|
|
697
713
|
|
|
698
|
-
def _exclusive_foreign_mention(self, msg: Message) -> str | None:
|
|
714
|
+
def _exclusive_foreign_mention(self, msg: Message, caption: bool = False) -> str | None:
|
|
699
715
|
"""
|
|
700
716
|
Return the first foreign @mention if all @mention entities are exclusively foreign.
|
|
701
717
|
|
|
702
718
|
Used in the reply-to-bot path: when the user threads Kowi's message but addresses
|
|
703
719
|
a different account via @mention, return that account's username so the caller can
|
|
704
720
|
detect a redirect. Returns None when we are also @mentioned (co-mention),
|
|
705
|
-
when there are no @mention entities, or when
|
|
721
|
+
when there are no @mention entities, or when the relevant entities list is absent.
|
|
706
722
|
|
|
707
723
|
Args:
|
|
708
724
|
msg: The incoming Telegram Message object.
|
|
725
|
+
caption: True to read msg.caption_entities (document captions) instead of
|
|
726
|
+
msg.entities (text messages).
|
|
709
727
|
|
|
710
728
|
Returns:
|
|
711
729
|
The first foreign @username string (with leading @) if all @mentions are
|
|
712
730
|
foreign, or None if we are co-mentioned or no @mention entities exist.
|
|
713
731
|
"""
|
|
714
|
-
if
|
|
732
|
+
entities = msg.caption_entities if caption else msg.entities
|
|
733
|
+
if not entities:
|
|
715
734
|
return None
|
|
716
735
|
our_username = self.telegram['username'].lower()
|
|
717
736
|
first_foreign = None
|
|
718
|
-
for entity in
|
|
737
|
+
for entity in entities:
|
|
719
738
|
if entity.type == MessageEntity.MENTION:
|
|
720
|
-
mentioned = msg.parse_entity(entity)
|
|
739
|
+
mentioned = msg.parse_caption_entity(entity) if caption else msg.parse_entity(entity)
|
|
721
740
|
if mentioned.lstrip('@').lower() == our_username:
|
|
722
741
|
return None
|
|
723
742
|
if first_foreign is None:
|
|
@@ -747,6 +766,90 @@ class TelegramBot:
|
|
|
747
766
|
except (TelegramError, AttributeError):
|
|
748
767
|
await msg.reply_text("Got it!")
|
|
749
768
|
|
|
769
|
+
async def _resolve_group_trigger(
|
|
770
|
+
self, msg: Message, text: str, context: ContextTypes.DEFAULT_TYPE, is_caption: bool = False,
|
|
771
|
+
) -> str | None:
|
|
772
|
+
"""
|
|
773
|
+
Apply group/supergroup trigger rules shared by text messages and document captions.
|
|
774
|
+
|
|
775
|
+
Checked in this order:
|
|
776
|
+
1. Exclusive foreign mention in a reply-to-foreign-bot thread - yields unconditionally,
|
|
777
|
+
taking precedence over @username, nickname/initials, and reply-to-bot signals.
|
|
778
|
+
2. @username mention - strips the mention from text; responds even if a foreign bot
|
|
779
|
+
is also @mentioned.
|
|
780
|
+
3. Nickname or initials mention - engages unconditionally.
|
|
781
|
+
4. Reply-to-bot - weaker signal; yields if exclusively addressed to another account
|
|
782
|
+
via @mention.
|
|
783
|
+
5. None of the above - not triggered.
|
|
784
|
+
Sends a read receipt via _send_read_receipt() before returning text for any triggered path.
|
|
785
|
+
|
|
786
|
+
Args:
|
|
787
|
+
msg: The Telegram Message in the group/supergroup chat.
|
|
788
|
+
text: msg.text or msg.caption - the content to evaluate for trigger words.
|
|
789
|
+
context: The Telegram context, passed through to _send_read_receipt().
|
|
790
|
+
is_caption: True when text is a document caption, so @mention entities are
|
|
791
|
+
read from msg.caption_entities instead of msg.entities.
|
|
792
|
+
|
|
793
|
+
Returns:
|
|
794
|
+
The text to process (username mention stripped if matched), or None if untriggered.
|
|
795
|
+
"""
|
|
796
|
+
is_reply_to_bot = (
|
|
797
|
+
msg.reply_to_message is not None and
|
|
798
|
+
msg.reply_to_message.from_user is not None and
|
|
799
|
+
msg.reply_to_message.from_user.id == self.telegram['bot_id']
|
|
800
|
+
)
|
|
801
|
+
is_reply_to_foreign_bot = (
|
|
802
|
+
msg.reply_to_message is not None and
|
|
803
|
+
msg.reply_to_message.from_user is not None and
|
|
804
|
+
msg.reply_to_message.from_user.is_bot and
|
|
805
|
+
msg.reply_to_message.from_user.id != self.telegram['bot_id']
|
|
806
|
+
)
|
|
807
|
+
if is_reply_to_foreign_bot and self._exclusive_foreign_mention(msg, is_caption):
|
|
808
|
+
return None
|
|
809
|
+
if exact_word_match(self.telegram['username'], text):
|
|
810
|
+
pattern = r'@?\b' + re.escape(self.telegram['username']) + r'\b'
|
|
811
|
+
text = re.sub(pattern, '', text).strip()
|
|
812
|
+
elif (
|
|
813
|
+
exact_word_match(self.telegram['nickname'], text) or
|
|
814
|
+
exact_word_match(self.telegram['initials'], text)
|
|
815
|
+
):
|
|
816
|
+
pass
|
|
817
|
+
elif is_reply_to_bot:
|
|
818
|
+
if self._exclusive_foreign_mention(msg, is_caption):
|
|
819
|
+
return None
|
|
820
|
+
else:
|
|
821
|
+
return None
|
|
822
|
+
await self._send_read_receipt(msg, context)
|
|
823
|
+
return text
|
|
824
|
+
|
|
825
|
+
async def _send_chunked_reply(
|
|
826
|
+
self, msg: Message, text: str, conv: Conversation | None = None, assistant_db_id: int | None = None,
|
|
827
|
+
) -> None:
|
|
828
|
+
"""
|
|
829
|
+
Split text into Telegram-sized chunks and send them sequentially.
|
|
830
|
+
|
|
831
|
+
Tracks each sent chunk's Telegram message ID in conv._loaded_message_ids (if conv is
|
|
832
|
+
given) and persists the last chunk's ID to assistant_db_id via update_message_tg_id
|
|
833
|
+
(if given), so cross-session tier 2 dedup can find the bot's own reply. A small pause
|
|
834
|
+
between sends reduces risk of Telegram rate limiting.
|
|
835
|
+
|
|
836
|
+
Args:
|
|
837
|
+
msg: The Telegram Message to reply to.
|
|
838
|
+
text: The full response text to chunk and send.
|
|
839
|
+
conv: The active Conversation, if any, for in-session message ID tracking.
|
|
840
|
+
assistant_db_id: The DB row id of the stored assistant message, if any.
|
|
841
|
+
"""
|
|
842
|
+
chunk_length = MessageLimit.MAX_TEXT_LENGTH - 1
|
|
843
|
+
chunks = [text[i:i+chunk_length] for i in range(0, len(text), chunk_length)]
|
|
844
|
+
for chunk in chunks:
|
|
845
|
+
sent = await msg.reply_text(chunk)
|
|
846
|
+
if sent:
|
|
847
|
+
if conv:
|
|
848
|
+
conv._loaded_message_ids.add(sent.message_id)
|
|
849
|
+
if assistant_db_id:
|
|
850
|
+
await update_message_tg_id(assistant_db_id, sent.message_id)
|
|
851
|
+
await asyncio.sleep(0.12) # small pause to reduce risk of rate limiting
|
|
852
|
+
|
|
750
853
|
async def tele_handle_message(self, update: Update, context: ContextTypes.DEFAULT_TYPE):
|
|
751
854
|
"""
|
|
752
855
|
Route incoming Telegram messages to appropriate handlers based on chat type and trigger conditions.
|
|
@@ -804,66 +907,114 @@ class TelegramBot:
|
|
|
804
907
|
response = _MSG_PROCESS_ERROR
|
|
805
908
|
assistant_db_id = None
|
|
806
909
|
if chat.type == 'supergroup' or chat.type == 'group':
|
|
807
|
-
|
|
808
|
-
|
|
809
|
-
msg.reply_to_message.from_user is not None and
|
|
810
|
-
msg.reply_to_message.from_user.id == self.telegram['bot_id']
|
|
811
|
-
)
|
|
812
|
-
is_reply_to_foreign_bot = (
|
|
813
|
-
msg.reply_to_message is not None and
|
|
814
|
-
msg.reply_to_message.from_user is not None and
|
|
815
|
-
msg.reply_to_message.from_user.is_bot and
|
|
816
|
-
msg.reply_to_message.from_user.id != self.telegram['bot_id']
|
|
817
|
-
)
|
|
818
|
-
# In a foreign-bot reply thread, an exclusive @mention of another account
|
|
819
|
-
# takes absolute precedence over any nickname/initials match in the text.
|
|
820
|
-
if is_reply_to_foreign_bot and self._exclusive_foreign_mention(msg):
|
|
821
|
-
return
|
|
822
|
-
if exact_word_match(self.telegram['username'], msg.text):
|
|
823
|
-
# Explicit @username mention: strongest signal - respond even if another
|
|
824
|
-
# bot is also @mentioned (both may be intentionally addressed).
|
|
825
|
-
pattern = r'@?\b' + re.escape(self.telegram['username']) + r'\b'
|
|
826
|
-
new_text = re.sub(pattern, '', msg.text).strip()
|
|
827
|
-
await self._send_read_receipt(msg, context)
|
|
828
|
-
response, assistant_db_id = await self.tele_handle_response(new_text, msg)
|
|
829
|
-
elif (
|
|
830
|
-
exact_word_match(self.telegram['nickname'], msg.text) or
|
|
831
|
-
exact_word_match(self.telegram['initials'], msg.text)
|
|
832
|
-
):
|
|
833
|
-
# Nickname/initials: always engage - no reliable way to distinguish
|
|
834
|
-
# our name as addressee vs topic from text position alone.
|
|
835
|
-
await self._send_read_receipt(msg, context)
|
|
836
|
-
response, assistant_db_id = await self.tele_handle_response(msg.text, msg)
|
|
837
|
-
elif is_reply_to_bot:
|
|
838
|
-
# Reply-to-bot: weaker signal - yield silently if the message is
|
|
839
|
-
# exclusively addressed to a foreign account via @mention.
|
|
840
|
-
if self._exclusive_foreign_mention(msg):
|
|
841
|
-
return
|
|
842
|
-
await self._send_read_receipt(msg, context)
|
|
843
|
-
response, assistant_db_id = await self.tele_handle_response(msg.text, msg)
|
|
844
|
-
else:
|
|
910
|
+
triggered_text = await self._resolve_group_trigger(msg, msg.text, context)
|
|
911
|
+
if triggered_text is None:
|
|
845
912
|
return
|
|
913
|
+
response, assistant_db_id = await self.tele_handle_response(triggered_text, msg)
|
|
846
914
|
elif chat.type == 'private':
|
|
847
915
|
response, assistant_db_id = await self.tele_handle_response(msg.text, msg)
|
|
848
916
|
else:
|
|
849
917
|
return
|
|
850
918
|
|
|
851
|
-
#
|
|
852
|
-
|
|
853
|
-
|
|
919
|
+
# Persist this chunk's Telegram message ID on the assistant row. Each call overwrites
|
|
920
|
+
# the previous, so only the last chunk's ID is retained for cross-session tier 2 dedup.
|
|
921
|
+
# Tier 1 covers all chunks in-session via _loaded_message_ids.
|
|
854
922
|
conv = self.conversations.get(msg.chat.id)
|
|
855
|
-
|
|
856
|
-
|
|
857
|
-
|
|
858
|
-
|
|
859
|
-
|
|
860
|
-
|
|
861
|
-
|
|
862
|
-
|
|
863
|
-
|
|
864
|
-
|
|
865
|
-
|
|
866
|
-
|
|
923
|
+
await self._send_chunked_reply(msg, response, conv, assistant_db_id)
|
|
924
|
+
|
|
925
|
+
async def tele_handle_document(self, update: Update, context: ContextTypes.DEFAULT_TYPE):
|
|
926
|
+
"""
|
|
927
|
+
Route Telegram document messages through the document summarisation pipeline.
|
|
928
|
+
|
|
929
|
+
Group trigger conditions (caption @mention, nickname/initials match, or reply-to-bot)
|
|
930
|
+
are resolved via the shared _resolve_group_trigger() also used by tele_handle_message,
|
|
931
|
+
including the exclusive-foreign-mention yield on reply-to-bot threads. Silently ignores
|
|
932
|
+
documents in channels, edited messages, and in groups/supergroups where no trigger
|
|
933
|
+
condition matched. Once triggered, respects the same global online/offline gate as
|
|
934
|
+
tele_handle_response() (set via /start, /stop) - replies with the offline message
|
|
935
|
+
rather than processing while offline. When document_processing is disabled in config,
|
|
936
|
+
replies with a friendly message instead of processing - but only when the message was
|
|
937
|
+
otherwise triggered (private chat, or a matched group trigger); untriggered group
|
|
938
|
+
documents still yield silently regardless of the flag. Files over 20 MB receive a
|
|
939
|
+
friendly error before download.
|
|
940
|
+
|
|
941
|
+
The user message stored in DB is '[Document: filename] caption'; document
|
|
942
|
+
bytes are never persisted. Respects is_private for cross-chat context isolation.
|
|
943
|
+
|
|
944
|
+
Args:
|
|
945
|
+
update: The Telegram Update containing the document message.
|
|
946
|
+
context: The Telegram context for downloading files and sending replies.
|
|
947
|
+
"""
|
|
948
|
+
validated = await self.tele_validate(update)
|
|
949
|
+
if not validated:
|
|
950
|
+
return
|
|
951
|
+
(msg, chat, user) = validated
|
|
952
|
+
|
|
953
|
+
caption = msg.caption or ''
|
|
954
|
+
|
|
955
|
+
if chat.type in ('group', 'supergroup'):
|
|
956
|
+
triggered_caption = await self._resolve_group_trigger(msg, caption, context, is_caption=True)
|
|
957
|
+
if triggered_caption is None:
|
|
958
|
+
return
|
|
959
|
+
caption = triggered_caption
|
|
960
|
+
elif chat.type != 'private':
|
|
961
|
+
return
|
|
962
|
+
|
|
963
|
+
if not self._online:
|
|
964
|
+
await msg.reply_text(_MSG_OFFLINE)
|
|
965
|
+
return
|
|
966
|
+
|
|
967
|
+
if not self.llm.get('document_processing', True):
|
|
968
|
+
await msg.reply_text(_MSG_DOC_PROCESSING_OFF)
|
|
969
|
+
return
|
|
970
|
+
|
|
971
|
+
if msg.document.file_size and msg.document.file_size > 20_000_000:
|
|
972
|
+
await msg.reply_text("That file is too large for me to read - please keep it under 20 MB.")
|
|
973
|
+
return
|
|
974
|
+
|
|
975
|
+
chat_id = chat.id
|
|
976
|
+
chat_type = chat.type
|
|
977
|
+
user_id = user.id
|
|
978
|
+
username = user.username
|
|
979
|
+
first_name = user.first_name or ''
|
|
980
|
+
last_name = user.last_name or ''
|
|
981
|
+
|
|
982
|
+
conv = await self._get_or_load_conversation(chat_id, chat_type, chat.title, user_id)
|
|
983
|
+
|
|
984
|
+
user_private_mode = await get_private_mode(user_id)
|
|
985
|
+
is_private = (chat_type == 'private') and user_private_mode
|
|
986
|
+
|
|
987
|
+
filename = msg.document.file_name or 'document'
|
|
988
|
+
user_text = f"[Document: {filename}] {caption}".strip() if caption else f"[Document: {filename}]"
|
|
989
|
+
user_msg_id = await conv.add_user_message(
|
|
990
|
+
user_text, user_id, username, first_name, last_name, is_private, msg.message_id
|
|
991
|
+
)
|
|
992
|
+
|
|
993
|
+
await msg.reply_text("Sure, give me a moment to read that...")
|
|
994
|
+
|
|
995
|
+
doc_file = await context.bot.get_file(msg.document.file_id)
|
|
996
|
+
file_bytes = bytes(await doc_file.download_as_bytearray())
|
|
997
|
+
mime_type = msg.document.mime_type or ''
|
|
998
|
+
|
|
999
|
+
reply = await handle_document_message(
|
|
1000
|
+
file_bytes, mime_type, filename, caption, self.llm['url_model'], conv.system_content
|
|
1001
|
+
)
|
|
1002
|
+
|
|
1003
|
+
assistant_db_id = await conv.add_assistant_message(
|
|
1004
|
+
reply,
|
|
1005
|
+
self.telegram['bot_id'],
|
|
1006
|
+
self.telegram['username'],
|
|
1007
|
+
self.telegram['first_name'],
|
|
1008
|
+
self.telegram['last_name'],
|
|
1009
|
+
is_private,
|
|
1010
|
+
user_msg_id,
|
|
1011
|
+
)
|
|
1012
|
+
|
|
1013
|
+
token_count = await conv.get_message_token_count()
|
|
1014
|
+
if token_count > self.llm['prune_threshold']:
|
|
1015
|
+
await conv.prune_conversation(self.llm['prune_back_to'])
|
|
1016
|
+
|
|
1017
|
+
await self._send_chunked_reply(msg, reply, conv, assistant_db_id)
|
|
867
1018
|
|
|
868
1019
|
async def tele_validate(self, update: Update) -> tuple[Message, Chat, User] | None:
|
|
869
1020
|
"""
|
|
@@ -1097,8 +1248,9 @@ class TelegramBot:
|
|
|
1097
1248
|
token_limit = INIT_BOT_CONFIG['token_limit'],
|
|
1098
1249
|
search_limit = INIT_BOT_CONFIG['search_limit'],
|
|
1099
1250
|
persona_temp = INIT_BOT_CONFIG['persona_temp'],
|
|
1100
|
-
archive_days
|
|
1101
|
-
|
|
1251
|
+
archive_days = INIT_BOT_CONFIG['archive_days'],
|
|
1252
|
+
document_processing = INIT_BOT_CONFIG['document_processing'],
|
|
1253
|
+
persona_prompt = INIT_BOT_CONFIG['persona_prompt'],
|
|
1102
1254
|
key_status: ApiKeyStatus | None = None,
|
|
1103
1255
|
instance_name: str | None = None,
|
|
1104
1256
|
webhook_schemas: list | None = None,
|
|
@@ -1180,6 +1332,7 @@ class TelegramBot:
|
|
|
1180
1332
|
self.telegram['app'].add_handler(CommandHandler('private', self.tele_private_command))
|
|
1181
1333
|
self.telegram['app'].add_handler(MessageHandler(filters.COMMAND, self.tele_unknown_command))
|
|
1182
1334
|
self.telegram['app'].add_handler(MessageHandler(filters.TEXT & ~filters.UpdateType.EDITED_MESSAGE, self.tele_handle_message))
|
|
1335
|
+
self.telegram['app'].add_handler(MessageHandler(filters.Document.ALL & ~filters.UpdateType.EDITED_MESSAGE, self.tele_handle_document))
|
|
1183
1336
|
self.telegram['app'].add_error_handler(self.tele_error)
|
|
1184
1337
|
|
|
1185
1338
|
# Validate optional config values before storing; warn and fall back to defaults on bad input
|
|
@@ -1199,14 +1352,15 @@ class TelegramBot:
|
|
|
1199
1352
|
# Get our LLM spun up with defaults if not defined by user input
|
|
1200
1353
|
# Tokens as integers measure the length of conversation messages
|
|
1201
1354
|
self.llm = {
|
|
1202
|
-
'prompt'
|
|
1203
|
-
'chat_model'
|
|
1204
|
-
'url_model'
|
|
1205
|
-
'token_limit'
|
|
1206
|
-
'search_limit'
|
|
1207
|
-
'temperature'
|
|
1208
|
-
'top_p'
|
|
1209
|
-
'archive_days'
|
|
1355
|
+
'prompt' : persona_prompt,
|
|
1356
|
+
'chat_model' : chat_model,
|
|
1357
|
+
'url_model' : url_model,
|
|
1358
|
+
'token_limit' : token_limit or TokenLimits(chat_model).max_tokens(),
|
|
1359
|
+
'search_limit' : search_limit or 30,
|
|
1360
|
+
'temperature' : persona_temp or 1.0,
|
|
1361
|
+
'top_p' : 0.9,
|
|
1362
|
+
'archive_days' : archive_days if archive_days is not None else 60,
|
|
1363
|
+
'document_processing' : document_processing if document_processing is not None else True,
|
|
1210
1364
|
}
|
|
1211
1365
|
# Set a rounded-down integer to prune a lengthy conversation by 500 tokens
|
|
1212
1366
|
# Note if the upper limit is below 500, the lower limit is set to 0
|
|
@@ -1291,8 +1445,9 @@ class TelegramBot:
|
|
|
1291
1445
|
token_limit = config['token_limit'],
|
|
1292
1446
|
search_limit = config['search_limit'],
|
|
1293
1447
|
persona_temp = config['persona_temp'],
|
|
1294
|
-
archive_days
|
|
1295
|
-
|
|
1448
|
+
archive_days = config['archive_days'],
|
|
1449
|
+
document_processing = config.get('document_processing'),
|
|
1450
|
+
persona_prompt = prompt,
|
|
1296
1451
|
key_status = key_status,
|
|
1297
1452
|
instance_name = config['instance_name'],
|
|
1298
1453
|
webhook_schemas = webhook_schemas,
|
|
@@ -113,6 +113,7 @@ INIT_BOT_CONFIG = {
|
|
|
113
113
|
'persona_temp': None,
|
|
114
114
|
'archive_days': None,
|
|
115
115
|
'allow_local_webhooks': None,
|
|
116
|
+
'document_processing': None,
|
|
116
117
|
'persona_prompt': 'You are a generic test bot powered by a user-configured LLM.'
|
|
117
118
|
}
|
|
118
119
|
|
|
@@ -123,6 +124,7 @@ INIT_BOT_CONFIG_COMMENTS = {
|
|
|
123
124
|
'persona_temp': '# Optional, LLM temperature 0.0-2.0 (default: model\'s default)',
|
|
124
125
|
'archive_days': '# Optional, days before messages are eligible for Tier 1 archival (default: 60, min: 1). Tier 2 triggers at 2x this value.',
|
|
125
126
|
'allow_local_webhooks': '# Optional, set to true to permit webhook/MCP URLs targeting loopback or link-local addresses (default: false)',
|
|
127
|
+
'document_processing': '# Optional, set to false to disable document summarisation (default: true)',
|
|
126
128
|
}
|
|
127
129
|
|
|
128
130
|
# Append the framework-owned system appendix to the persona prompt.
|
|
@@ -0,0 +1,316 @@
|
|
|
1
|
+
# Handles incoming messages and URLs unique for TeLLMgramBot
|
|
2
|
+
import io
|
|
3
|
+
import logging
|
|
4
|
+
import re
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
from typing import Optional
|
|
7
|
+
|
|
8
|
+
from charset_normalizer import from_bytes as _cn_from_bytes
|
|
9
|
+
import defusedxml.ElementTree as _defusedxml_ET
|
|
10
|
+
import pypdf
|
|
11
|
+
|
|
12
|
+
from .utils import log_error
|
|
13
|
+
from .models import TokenLimits
|
|
14
|
+
from .web_utils import (
|
|
15
|
+
fetch_url,
|
|
16
|
+
strip_html_markup,
|
|
17
|
+
InvalidURLException,
|
|
18
|
+
InsecureURLException,
|
|
19
|
+
SusURLException,
|
|
20
|
+
)
|
|
21
|
+
from .providers.factory import get_provider
|
|
22
|
+
|
|
23
|
+
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# System-message template for URL analysis. The literal "{content}" marker is
# substituted with the fetched page text by the code that uses this template.
_URL_ANALYSIS_TEMPLATE = "".join((
    "## URL Analysis\n",
    "The user has provided a URL to perform some level of analysis. ",
    "You will infer the nature of the analysis from the user's query.\n\n",
    "The contents of the URL mentioned have already been harvested and cleansed. ",
    "Note the URL contents will likely have sections of text that are less relevant "
    "to the user's question (headers, footers, menus, ads, etc.). ",
    "You will need to ignore those sections of text and focus on the main content of the page.\n\n",
    "The contents of the URL are shown below:\n",
    "BEGIN URL CONTENTS\n",
    "{content}\n",
    "END URL CONTENTS\n",
))

# System-message template for document analysis; same "{content}" convention.
_DOCUMENT_ANALYSIS_TEMPLATE = "".join((
    "## Document Analysis\n",
    "The user has shared a document for analysis. ",
    "Infer the nature of the analysis from the user's caption or question. ",
    "If no specific question is provided, summarise the document's main content and key points.\n\n",
    "The document contents are shown below:\n",
    "BEGIN DOCUMENT CONTENTS\n",
    "{content}\n",
    "END DOCUMENT CONTENTS\n",
))

# File extensions that are decoded as text regardless of the reported MIME type.
_PLAIN_TEXT_EXTENSIONS = frozenset((
    # Prose and markup
    '.txt', '.md', '.rst', '.tex', '.html', '.htm', '.xml',
    # Tabular and structured data
    '.csv', '.tsv', '.json', '.jsonl', '.yaml', '.yml', '.toml',
    # Configuration and logs
    '.ini', '.cfg', '.conf', '.log',
    # Source code and scripts
    '.py', '.js', '.ts', '.sh', '.bash', '.rb', '.go', '.rs',
    '.java', '.c', '.cpp', '.h', '.cs', '.php', '.sql', '.r',
))

# MIME types routed to the HTML and XML extraction paths respectively.
_HTML_MIMES = frozenset(('text/html', 'application/xhtml+xml'))
_XML_MIMES = frozenset(('text/xml', 'application/xml'))

# Upper bound on the number of PDF pages whose text is extracted.
_PDF_PAGE_CAP = 100
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
def handle_greetings(text: str) -> Optional[str]:
    """
    Produce an instant reply for a message that is nothing but a one-word greeting.

    The text is title-cased, trimmed, and stripped of every non-word character
    before being compared against the known greetings, for example:
    - '  hello  ' -> 'Hello!'
    - 'Hey...?'   -> 'Hey!'
    - 'SUP?!?!'   -> 'Sup!'

    Returns:
        The normalised greeting followed by '!', or None when the text is not
        one of the recognised greetings.
    """
    candidate = re.sub(r'[^\w]', '', text.title().strip())
    recognised = ('Hello', 'Hi', 'Hey', 'Heya', 'Sup', 'Yo')
    return f"{candidate}!" if candidate in recognised else None
|
|
74
|
+
|
|
75
|
+
|
|
76
|
+
def handle_common_queries(text: str) -> Optional[str]:
    """
    Produce an instant canned reply for a few well-known casual phrases.

    The text is lower-cased, trimmed, and stripped of every non-word character
    (spaces and punctuation included) before matching, for example:
    - '  How you doing  ' -> 'How YOU doin?'
    - 'What's up!'        -> 'Wassup?'

    Returns:
        The canned reply, or None when the text matches no known phrase.
    """
    squashed = re.sub(r'[^\w]', '', text.lower().strip())
    if squashed.startswith('howyoudoin'):
        return 'How YOU doin?'
    if squashed in ('wassup', 'whatup', 'whatsup'):
        return 'Wassup?'
    return None
|
|
88
|
+
|
|
89
|
+
|
|
90
|
+
def _decode_bytes(raw: bytes) -> tuple:
    """
    Turn raw bytes into text, trying progressively more permissive decoders.

    Order of attempts: strict UTF-8, then the best match reported by
    charset-normalizer, then Latin-1 as the last resort.

    Returns:
        Tuple of (decoded_text, encoding). encoding is the empty string when the
        bytes were valid UTF-8, so callers can skip the encoding annotation.
    """
    try:
        utf8_text = raw.decode('utf-8')
    except UnicodeDecodeError:
        utf8_text = None
    if utf8_text is not None:
        return utf8_text, ''

    # Not valid UTF-8: ask charset-normalizer for its best guess.
    guess = _cn_from_bytes(raw).best()
    if guess is None:
        # No candidate encoding was found; Latin-1 maps every byte value.
        return raw.decode('latin-1'), 'ISO-8859-1'
    return str(guess), guess.encoding
|
|
105
|
+
|
|
106
|
+
|
|
107
|
+
def _extract_document_text(file_bytes: bytes, mime_type: str, filename: str) -> tuple:
    """
    Extract plain text from document bytes, routing by MIME type and file extension.

    Routing order is PDF, HTML, XML, then generic plain text:
    - PDF text is extracted via pypdf (capped at _PDF_PAGE_CAP pages, strict=False).
    - HTML content is decoded and has its tags stripped via strip_html_markup.
    - XML is parsed via defusedxml and its text nodes are joined with spaces. If
      parsing fails, or the document has no text nodes, the raw decoded text is
      returned instead.
    - Other text types are decoded with the UTF-8 -> charset-normalizer -> Latin-1
      chain in _decode_bytes.
    Whenever raw decoded text is returned (plain text and the XML fallbacks) and
    the bytes were not UTF-8, a "[File encoding: ...]" line is prepended so the
    LLM has context.

    Args:
        file_bytes: Raw document bytes downloaded from Telegram.
        mime_type: MIME type reported by Telegram (may be empty or None). Any
            parameters after ';' (for example "; charset=utf-8") are ignored.
        filename: Original filename used for extension-based routing (may be empty).

    Returns:
        Tuple of (text, error). On success text is the extracted content and error
        is None. On failure text is None and error is the user-facing response string.
    """
    # The MIME type is sender-supplied and may carry parameters or stray
    # whitespace; compare on the bare "type/subtype" only so that
    # "text/html; charset=utf-8" is still routed as HTML.
    mime = (mime_type or '').split(';', 1)[0].strip().lower()
    ext = Path(filename).suffix.lower() if filename else ''

    def _decode_annotated() -> str:
        """Decode file_bytes, prefixing the detected encoding when it is not UTF-8."""
        decoded, encoding = _decode_bytes(file_bytes)
        return f"[File encoding: {encoding}]\n{decoded}" if encoding else decoded

    if mime == 'application/pdf' or ext == '.pdf':
        try:
            reader = pypdf.PdfReader(io.BytesIO(file_bytes), strict=False)
            text = '\n'.join(
                page.extract_text() or '' for page in reader.pages[:_PDF_PAGE_CAP]
            )
        except Exception as e:
            # Any parser failure is logged and reported as one generic message.
            log_error(e, 'PDF')
            return None, "Something went wrong while reading that PDF. Please try again."
        if not text.strip():
            return None, "This PDF appears to be image-only; I can't read the text in it."
        return text, None

    if ext in ('.html', '.htm') or mime in _HTML_MIMES:
        raw_text, _ = _decode_bytes(file_bytes)
        return strip_html_markup(raw_text), None

    if ext == '.xml' or mime in _XML_MIMES:
        try:
            root = _defusedxml_ET.fromstring(file_bytes)
            xml_text = ' '.join(root.itertext()).strip()
        except Exception:
            # Deliberately broad: whatever makes parsing fail, fall back to
            # showing the raw text rather than failing the whole request.
            xml_text = ''
        return xml_text or _decode_annotated(), None

    if mime.startswith('text/') or ext in _PLAIN_TEXT_EXTENSIONS:
        return _decode_annotated(), None

    return None, "I can only read plain text and PDF files right now."
|
|
170
|
+
|
|
171
|
+
|
|
172
|
+
async def summarise_text(
    content: str,
    question: str,
    model: str,
    template: str,
    prompt: str = '',
) -> str:
    """
    Token-prune content, apply template, and complete via the LLM.

    The system message is the persona prompt (if any) followed by the template
    with its {content} placeholder substituted. When that composed message plus
    the user turn exceeds the budget (the model's max_tokens minus 500), content
    is cut down to a prefix that fits. Token counts are always measured on the
    fully composed messages, so the template and persona prompt are accounted for.

    The cut point is found by binary search over the prefix length and then moved
    back to the preceding space so the text ends on a whole word. This needs a
    logarithmic number of token counts instead of one per removed word. The
    search assumes the token count does not shrink as the prefix grows; the
    word-boundary prefix is re-measured before it is used.

    Args:
        content: Text content to summarise (URL body or document text).
        question: The user's message or caption; used as the LLM user turn.
        model: LLM model name.
        template: System prompt template with a {content} placeholder.
        prompt: Bot persona prompt prepended to the composed system message.

    Returns:
        LLM response string. A truncation note quoting the last 50 kept
        characters is appended when content was pruned. If token counting or the
        LLM call raises, the error is logged and a user-friendly error string is
        returned instead.
    """
    def _compose(body: str) -> str:
        system = template.replace('{content}', body)
        return f"{prompt}\n\n{system}" if prompt else system

    messages = [
        {"role": "system", "content": _compose(content)},
        {"role": "user", "content": question},
    ]
    lengthy = False
    pruned_tail = ''
    try:
        token_model = TokenLimits(model)
        token_limit = token_model.max_tokens() - 500

        async def _fits(body: str) -> bool:
            """Load body into the system message and report whether it is within budget."""
            messages[0]["content"] = _compose(body)
            return await token_model.num_tokens_from_messages(messages) <= token_limit

        if not await _fits(content):
            lengthy = True
            # Invariant: content[:lo] fits (or lo == 0), content[:hi] does not.
            lo, hi = 0, len(content)
            while hi - lo > 1:
                mid = (lo + hi) // 2
                if await _fits(content[:mid]):
                    lo = mid
                else:
                    hi = mid
            kept = content[:lo]
            # If the cut landed inside a word, back up to the previous space.
            # With no usable space (minified JSON, base64, one long token) the
            # raw character cut is kept as-is.
            if content[lo:lo + 1] != ' ':
                head = kept.rpartition(' ')[0]
                if head and await _fits(head):
                    kept = head
            # _fits() leaves its last probe in messages; install the final text.
            messages[0]["content"] = _compose(kept)
            pruned_tail = kept[-50:]
        response = await get_provider(model).complete(model, messages)
    except Exception as e:
        log_error(e, model)
        return "Something went wrong while processing the content. Please try again later."
    if lengthy:
        response += (
            "\n\n*NOTE*: The content was too long and needed to be pruned for my summary."
            f" If the text after \"{pruned_tail}\" is crucial, insert the rest for me."
        )
    return response
|
|
237
|
+
|
|
238
|
+
|
|
239
|
+
async def handle_url_ask(text: str, model: str = 'gpt-4o', prompt: str = '') -> Optional[str]:
    """
    Summarise the page behind a URL that the user wrapped in [square brackets].

    The first bracketed http(s) URL in the message is fetched with fetch_url, its
    markup is removed with strip_html_markup, and the result is passed to
    summarise_text together with the URL analysis template and the persona prompt.
    The InvalidURLException, InsecureURLException and SusURLException handlers,
    and a catch-all for anything else, turn failures into user-facing messages.

    Args:
        text: The message text potentially containing a URL in [square brackets].
        model: The LLM model to use for URL summarization (default: 'gpt-4o').
        prompt: Bot persona prompt prepended to the URL analysis system message.

    Returns:
        The summary (or an error message) when a bracketed URL was found, or
        None when the text contains no bracketed URL.
    """
    bracketed = re.search(r'\[http(s)?://\S+]', text.strip())
    if bracketed is None:
        return None

    # Drop the surrounding square brackets from the matched span.
    url = bracketed.group()[1:-1]
    try:
        page_text = strip_html_markup(await fetch_url(url))
        summary = await summarise_text(page_text, text, model, _URL_ANALYSIS_TEMPLATE, prompt)
        return summary
    except InvalidURLException as e:
        log_error(e, 'URL')
        return "The URL you provided appears to be invalid. Could you please check it and try again?"
    except InsecureURLException:
        return (
            "The URL you provided is not secure. Could you please try another URL, "
            "or just pasting the relevant content here?"
        )
    except SusURLException:
        return (
            "The URL you provided is potentially unsafe, based on my internal scans. "
            "You can check the safety of URLS using this site: "
            "https://www.virustotal.com/gui/home/url"
        )
    except Exception as e:
        log_error(e, 'URL')
        return f"Something went wrong while fetching the URL: {e}"
|
|
280
|
+
|
|
281
|
+
|
|
282
|
+
async def handle_document_message(
    file_bytes: bytes,
    mime_type: str,
    filename: str,
    caption: str,
    model: str,
    prompt: str = '',
) -> str:
    """
    Summarise an uploaded document via the LLM.

    Text is pulled out of the raw bytes by _extract_document_text(), which routes
    on MIME type and file extension, and is then handed to summarise_text() with
    the document analysis template. The log line records only the filename, MIME
    type and byte count, never the file content.

    Args:
        file_bytes: Raw document bytes downloaded from Telegram.
        mime_type: MIME type reported by Telegram (may be empty string).
        filename: Original filename for extension-based routing.
        caption: User's caption or question; used as the LLM user turn. When
            empty, a default "please summarise" request is sent instead.
        model: LLM model to use for summarisation.
        prompt: Bot persona prompt prepended to the system message.

    Returns:
        LLM response string, or a user-facing error message string.
    """
    display_name = filename or 'unknown'
    display_mime = mime_type or 'unknown'
    logger.info("Document: name=%s mime=%s size=%d", display_name, display_mime, len(file_bytes))

    extracted, failure = _extract_document_text(file_bytes, mime_type, filename)
    if failure:
        # Extraction already produced the message to show the user.
        return failure

    user_turn = caption or 'Please summarise this document.'
    return await summarise_text(extracted, user_turn, model, _DOCUMENT_ANALYSIS_TEMPLATE, prompt)
|
|
@@ -330,7 +330,7 @@ async def discover_mcp_tools(
|
|
|
330
330
|
|
|
331
331
|
raw_headers = entry.get('headers') or {}
|
|
332
332
|
if not isinstance(raw_headers, dict):
|
|
333
|
-
logger.warning(f"MCP server '{
|
|
333
|
+
logger.warning(f"MCP server '{log_url}': 'headers' must be a dict; treating as empty.")
|
|
334
334
|
raw_headers = {}
|
|
335
335
|
expanded_headers = {}
|
|
336
336
|
disabled = False
|
|
@@ -431,7 +431,7 @@ async def discover_mcp_tools(
|
|
|
431
431
|
all_registered.add(tool_name)
|
|
432
432
|
server_count += 1
|
|
433
433
|
|
|
434
|
-
logger.info(f"MCP server '{
|
|
434
|
+
logger.info(f"MCP server '{log_url}': registered {server_count} tool(s).")
|
|
435
435
|
|
|
436
436
|
return schemas, defs
|
|
437
437
|
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: TeLLMgramBot
|
|
3
|
-
Version: 3.
|
|
3
|
+
Version: 3.15.0
|
|
4
4
|
Summary: LLM-powered Telegram bot (OpenAI + Anthropic)
|
|
5
5
|
Home-page: https://github.com/Digital-Heresy/TeLLMgramBot
|
|
6
6
|
Author: Digital Heresy
|
|
@@ -19,6 +19,9 @@ Requires-Dist: tiktoken>=0.12
|
|
|
19
19
|
Requires-Dist: python-telegram-bot>=20.8
|
|
20
20
|
Requires-Dist: aiosqlite>=0.19
|
|
21
21
|
Requires-Dist: tzdata>=2025.2
|
|
22
|
+
Requires-Dist: pypdf>=6.0
|
|
23
|
+
Requires-Dist: defusedxml>=0.7
|
|
24
|
+
Requires-Dist: charset-normalizer>=3.0
|
|
22
25
|
Dynamic: author
|
|
23
26
|
Dynamic: author-email
|
|
24
27
|
Dynamic: description
|
|
@@ -41,6 +44,10 @@ The basic goal of this project is to create a bridge between a Telegram Bot and
|
|
|
41
44
|
* Pass URLs in [square brackets] and mention how the bot should interpret them.
|
|
42
45
|
* Example: "What do you think of this article? [https://some_site/article]"
|
|
43
46
|
* Uses a separate model (configurable via `url_model`) to handle larger URL content.
|
|
47
|
+
* Share documents and text files for analysis and summarisation.
|
|
48
|
+
* Supported formats: PDF, plain-text files (.txt, .md, .rst, .csv, .json, etc.), HTML, and XML.
|
|
49
|
+
* The bot extracts and summarises content, with automatic encoding detection for non-UTF-8 files. Files over 20 MB are rejected.
|
|
50
|
+
* Can be disabled via `document_processing: false` in config.
|
|
44
51
|
* Ask questions about message history across all your chats using natural language; the bot will search, attribute messages to speakers, and include messages from other bots.
|
|
45
52
|
* Example: "Who said thanks for the breakdown?" or "What did George say about the project?" or "Show me the last few messages."
|
|
46
53
|
* All search filters (speaker, chat, date) are optional. Results are ordered most-recent-first. Configure `search_limit` to control how many results to return (default: 30).
|
|
@@ -157,6 +164,7 @@ When the bot is triggered in a group and about to respond (not deferring to anot
|
|
|
157
164
|
- `token_limit`: Max tokens (optional; defaults to model's maximum)
|
|
158
165
|
- `search_limit`: Max search results (optional; defaults to 30)
|
|
159
166
|
- `archive_days`: Days before messages are eligible for archival (optional; default 60, minimum 1). Older messages are distilled into daily summaries, then progressively compressed into monthly digests. Once archived, the raw messages no longer appear in the LLM context; they remain available only through message search.
|
|
167
|
+
- `document_processing`: Optional bool (default: true). Set to false to disable document and text file summarisation.
|
|
160
168
|
- `allow_local_webhooks`: Set to `true` to permit webhook/MCP URLs targeting loopback or link-local addresses (optional; default `false`). Useful when tools like Home Assistant run on the same host.
|
|
161
169
|
- `tools`: Optional list of webhook and MCP tool definitions (admin-only, private chat only). See [docs/tools.md](docs/tools.md) for schema and examples.
|
|
162
170
|
4. **Disable group privacy mode in BotFather:**
|
|
@@ -5,7 +5,7 @@ with open("README.md", "r") as fh:
|
|
|
5
5
|
|
|
6
6
|
setup(
|
|
7
7
|
name='TeLLMgramBot',
|
|
8
|
-
version='3.
|
|
8
|
+
version='3.15.0',
|
|
9
9
|
packages=find_packages(),
|
|
10
10
|
license='MIT',
|
|
11
11
|
author='Digital Heresy',
|
|
@@ -24,7 +24,10 @@ setup(
|
|
|
24
24
|
'tiktoken>=0.12',
|
|
25
25
|
'python-telegram-bot>=20.8',
|
|
26
26
|
'aiosqlite>=0.19',
|
|
27
|
-
'tzdata>=2025.2'
|
|
27
|
+
'tzdata>=2025.2',
|
|
28
|
+
'pypdf>=6.0',
|
|
29
|
+
'defusedxml>=0.7',
|
|
30
|
+
'charset-normalizer>=3.0',
|
|
28
31
|
],
|
|
29
32
|
python_requires='>=3.10'
|
|
30
33
|
)
|
|
@@ -1,153 +0,0 @@
|
|
|
1
|
-
# Handles incoming messages and URLs unique for TeLLMgramBot
|
|
2
|
-
import re
|
|
3
|
-
from typing import Optional
|
|
4
|
-
import validators
|
|
5
|
-
|
|
6
|
-
from .utils import log_error
|
|
7
|
-
from .models import TokenLimits
|
|
8
|
-
from .web_utils import (
|
|
9
|
-
fetch_url,
|
|
10
|
-
strip_html_markup,
|
|
11
|
-
InvalidURLException,
|
|
12
|
-
InsecureURLException,
|
|
13
|
-
SusURLException,
|
|
14
|
-
)
|
|
15
|
-
from .providers.factory import get_provider
|
|
16
|
-
|
|
17
|
-
_URL_ANALYSIS_TEMPLATE = (
|
|
18
|
-
"## URL Analysis\n"
|
|
19
|
-
"The user has provided a URL to perform some level of analysis. You will infer "
|
|
20
|
-
"the nature of the analysis from the user's query.\n\n"
|
|
21
|
-
"The contents of the URL mentioned have already been harvested and cleansed. "
|
|
22
|
-
"Note the URL contents will likely have sections of text that are less relevant "
|
|
23
|
-
"to the user's question (headers, footers, menus, ads, etc.). You will need to "
|
|
24
|
-
"ignore those sections of text and focus on the main content of the page.\n\n"
|
|
25
|
-
"The contents of the URL are shown below:\n"
|
|
26
|
-
"BEGIN URL CONTENTS\n"
|
|
27
|
-
"{url_content}\n"
|
|
28
|
-
"END URL CONTENTS\n"
|
|
29
|
-
)
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
def handle_greetings(text: str) -> Optional[str]:
|
|
33
|
-
"""
|
|
34
|
-
Respond quickly with single-word greetings like these examples:
|
|
35
|
-
- ' hello ' -> 'Hello!'
|
|
36
|
-
- 'Hey...?' -> 'Hey!'
|
|
37
|
-
- 'SUP?!?!' -> 'Sup!'
|
|
38
|
-
"""
|
|
39
|
-
greetings = {'Hello', 'Hi', 'Hey', 'Heya', 'Sup', 'Yo'}
|
|
40
|
-
word = re.sub(r'[^\w]', '', text.title().strip())
|
|
41
|
-
if word in greetings:
|
|
42
|
-
return f"{word}!"
|
|
43
|
-
return None
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
def handle_common_queries(text: str) -> Optional[str]:
|
|
47
|
-
"""
|
|
48
|
-
Send messages for assistant bot to respond quickly with some example phrases:
|
|
49
|
-
- ' How you doing ' -> 'How YOU doin?'
|
|
50
|
-
- 'What's up!' -> 'Wassup?'
|
|
51
|
-
"""
|
|
52
|
-
phrase = re.sub(r'[^\w]', '', text.lower().strip())
|
|
53
|
-
if phrase.startswith('howyoudoin'):
|
|
54
|
-
return 'How YOU doin?'
|
|
55
|
-
elif phrase == 'wassup' or phrase == 'whatup' or phrase == 'whatsup':
|
|
56
|
-
return 'Wassup?'
|
|
57
|
-
return None
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
async def handle_url_ask(text: str, model: str = 'gpt-4o', prompt: str = '') -> Optional[str]:
|
|
61
|
-
"""
|
|
62
|
-
Process URL content in an LLM to provide a summary.
|
|
63
|
-
|
|
64
|
-
Extracts URLs wrapped in square brackets [], validates them, checks for
|
|
65
|
-
safety via VirusTotal, fetches content, and summarizes via an LLM specified
|
|
66
|
-
by model name. The bot's persona prompt is prepended to the URL analysis
|
|
67
|
-
system message so responses match the bot's personality.
|
|
68
|
-
|
|
69
|
-
Args:
|
|
70
|
-
text: The message text potentially containing a URL in [square brackets].
|
|
71
|
-
model: The LLM model to use for URL summarization (default: 'gpt-4o').
|
|
72
|
-
prompt: Bot persona prompt prepended to the URL analysis system message.
|
|
73
|
-
|
|
74
|
-
Returns:
|
|
75
|
-
A summary string if a URL was found and processed successfully, an error
|
|
76
|
-
message string if processing failed, or None if no URL detected in text.
|
|
77
|
-
|
|
78
|
-
Raises:
|
|
79
|
-
No exceptions are raised; all errors are caught and logged, returning
|
|
80
|
-
user-friendly error messages instead.
|
|
81
|
-
"""
|
|
82
|
-
url_match = re.search(r'\[http(s)?://\S+]', text.strip())
|
|
83
|
-
if url_match:
|
|
84
|
-
# Extract the URL from the message, but not the square brackets
|
|
85
|
-
url = url_match.group()[1:-1]
|
|
86
|
-
|
|
87
|
-
# Fetch the URL content
|
|
88
|
-
try:
|
|
89
|
-
# The function strips the HTML markup and ensures the URL is valid and safe
|
|
90
|
-
url_content = strip_html_markup(await fetch_url(url))
|
|
91
|
-
|
|
92
|
-
# Check if the URL is valid real quick
|
|
93
|
-
if not validators.url(url):
|
|
94
|
-
raise InvalidURLException(f"Invalid URL parsed by message_handlers.handle_url_ask(): {url}")
|
|
95
|
-
|
|
96
|
-
# Build messages:
|
|
97
|
-
# 1. URL content to be added into the system prompt template
|
|
98
|
-
# 2. User message requesting URL in [square brackets]
|
|
99
|
-
messages = [
|
|
100
|
-
{"role": "system", "content": url_content},
|
|
101
|
-
{"role": "user", "content": text}
|
|
102
|
-
]
|
|
103
|
-
|
|
104
|
-
# Consider the maximum amount of tokens a LLM can support.
|
|
105
|
-
# If the URL content is too big, we need to prune it down to a reasonable size.
|
|
106
|
-
# Let's also reserve 500 tokens for prompt and response.
|
|
107
|
-
lengthy_url = False
|
|
108
|
-
pruned_tail = ''
|
|
109
|
-
token_model = TokenLimits(model)
|
|
110
|
-
token_count = await token_model.num_tokens_from_messages(messages)
|
|
111
|
-
token_limit = token_model.max_tokens() - 500
|
|
112
|
-
if token_count > token_limit:
|
|
113
|
-
lengthy_url = True
|
|
114
|
-
while token_count > token_limit:
|
|
115
|
-
# Remove every last word until the token limit is satisfied
|
|
116
|
-
messages[0]["content"] = messages[0]["content"].rsplit(' ', 1)[0]
|
|
117
|
-
token_count = await token_model.num_tokens_from_messages(messages)
|
|
118
|
-
# Show the last 50 characters of the pruned URL content
|
|
119
|
-
pruned_tail = messages[0]["content"][-50:]
|
|
120
|
-
|
|
121
|
-
# Build system message: bot persona (if any) + URL analysis template with content
|
|
122
|
-
url_system = _URL_ANALYSIS_TEMPLATE.replace('{url_content}', messages[0]["content"])
|
|
123
|
-
messages[0]["content"] = f"{prompt}\n\n{url_system}" if prompt else url_system
|
|
124
|
-
|
|
125
|
-
# Call the LLM for the response that summarizes URL content
|
|
126
|
-
try:
|
|
127
|
-
response = await get_provider(model).complete(model, messages)
|
|
128
|
-
except Exception as e:
|
|
129
|
-
log_error(e, f"{model} URL")
|
|
130
|
-
return "Something went wrong while fetching the URL. Please try again later."
|
|
131
|
-
|
|
132
|
-
# If the URL content was too long, let the user know
|
|
133
|
-
if lengthy_url:
|
|
134
|
-
response += ("\n\n"
|
|
135
|
-
"*NOTE*: The URL content was too long and needed to be pruned for my summary."
|
|
136
|
-
f" If the text after \"{pruned_tail}\" is crucial, insert the rest for me."
|
|
137
|
-
)
|
|
138
|
-
return response
|
|
139
|
-
|
|
140
|
-
except InvalidURLException as e:
|
|
141
|
-
log_error(e, 'URL')
|
|
142
|
-
return "The URL you provided appears to be invalid. Could you please check it and try again?"
|
|
143
|
-
except InsecureURLException:
|
|
144
|
-
return ("The URL you provided is not secure. Could you please try another URL, or just pasting the "
|
|
145
|
-
"relevant content here?")
|
|
146
|
-
except SusURLException:
|
|
147
|
-
return ("The URL you provided is potentially unsafe, based on my internal scans. You can check the safety "
|
|
148
|
-
"of URLS using this site: https://www.virustotal.com/gui/home/url")
|
|
149
|
-
except Exception as e:
|
|
150
|
-
log_error(e, 'URL')
|
|
151
|
-
return f"Something went wrong while fetching the URL: {e}"
|
|
152
|
-
|
|
153
|
-
return None
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|