TeLLMgramBot 3.14.2__tar.gz → 3.15.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (27) hide show
  1. {tellmgrambot-3.14.2 → tellmgrambot-3.15.0}/PKG-INFO +9 -1
  2. {tellmgrambot-3.14.2 → tellmgrambot-3.15.0}/README.md +5 -0
  3. {tellmgrambot-3.14.2 → tellmgrambot-3.15.0}/TeLLMgramBot/TeLLMgramBot.py +262 -107
  4. {tellmgrambot-3.14.2 → tellmgrambot-3.15.0}/TeLLMgramBot/initialize.py +2 -0
  5. tellmgrambot-3.15.0/TeLLMgramBot/message_handlers.py +316 -0
  6. {tellmgrambot-3.14.2 → tellmgrambot-3.15.0}/TeLLMgramBot/tools.py +2 -2
  7. {tellmgrambot-3.14.2 → tellmgrambot-3.15.0}/TeLLMgramBot.egg-info/PKG-INFO +9 -1
  8. {tellmgrambot-3.14.2 → tellmgrambot-3.15.0}/TeLLMgramBot.egg-info/requires.txt +3 -0
  9. {tellmgrambot-3.14.2 → tellmgrambot-3.15.0}/setup.py +5 -2
  10. tellmgrambot-3.14.2/TeLLMgramBot/message_handlers.py +0 -153
  11. {tellmgrambot-3.14.2 → tellmgrambot-3.15.0}/LICENSE +0 -0
  12. {tellmgrambot-3.14.2 → tellmgrambot-3.15.0}/TeLLMgramBot/__init__.py +0 -0
  13. {tellmgrambot-3.14.2 → tellmgrambot-3.15.0}/TeLLMgramBot/archive.py +0 -0
  14. {tellmgrambot-3.14.2 → tellmgrambot-3.15.0}/TeLLMgramBot/conversation.py +0 -0
  15. {tellmgrambot-3.14.2 → tellmgrambot-3.15.0}/TeLLMgramBot/database.py +0 -0
  16. {tellmgrambot-3.14.2 → tellmgrambot-3.15.0}/TeLLMgramBot/models.py +0 -0
  17. {tellmgrambot-3.14.2 → tellmgrambot-3.15.0}/TeLLMgramBot/providers/__init__.py +0 -0
  18. {tellmgrambot-3.14.2 → tellmgrambot-3.15.0}/TeLLMgramBot/providers/anthropic_provider.py +0 -0
  19. {tellmgrambot-3.14.2 → tellmgrambot-3.15.0}/TeLLMgramBot/providers/base.py +0 -0
  20. {tellmgrambot-3.14.2 → tellmgrambot-3.15.0}/TeLLMgramBot/providers/factory.py +0 -0
  21. {tellmgrambot-3.14.2 → tellmgrambot-3.15.0}/TeLLMgramBot/providers/openai_provider.py +0 -0
  22. {tellmgrambot-3.14.2 → tellmgrambot-3.15.0}/TeLLMgramBot/utils.py +0 -0
  23. {tellmgrambot-3.14.2 → tellmgrambot-3.15.0}/TeLLMgramBot/web_utils.py +0 -0
  24. {tellmgrambot-3.14.2 → tellmgrambot-3.15.0}/TeLLMgramBot.egg-info/SOURCES.txt +0 -0
  25. {tellmgrambot-3.14.2 → tellmgrambot-3.15.0}/TeLLMgramBot.egg-info/dependency_links.txt +0 -0
  26. {tellmgrambot-3.14.2 → tellmgrambot-3.15.0}/TeLLMgramBot.egg-info/top_level.txt +0 -0
  27. {tellmgrambot-3.14.2 → tellmgrambot-3.15.0}/setup.cfg +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: TeLLMgramBot
3
- Version: 3.14.2
3
+ Version: 3.15.0
4
4
  Summary: LLM-powered Telegram bot (OpenAI + Anthropic)
5
5
  Home-page: https://github.com/Digital-Heresy/TeLLMgramBot
6
6
  Author: Digital Heresy
@@ -19,6 +19,9 @@ Requires-Dist: tiktoken>=0.12
19
19
  Requires-Dist: python-telegram-bot>=20.8
20
20
  Requires-Dist: aiosqlite>=0.19
21
21
  Requires-Dist: tzdata>=2025.2
22
+ Requires-Dist: pypdf>=6.0
23
+ Requires-Dist: defusedxml>=0.7
24
+ Requires-Dist: charset-normalizer>=3.0
22
25
  Dynamic: author
23
26
  Dynamic: author-email
24
27
  Dynamic: description
@@ -41,6 +44,10 @@ The basic goal of this project is to create a bridge between a Telegram Bot and
41
44
  * Pass URLs in [square brackets] and mention how the bot should interpret them.
42
45
  * Example: "What do you think of this article? [https://some_site/article]"
43
46
  * Uses a separate model (configurable via `url_model`) to handle larger URL content.
47
+ * Share documents and text files for analysis and summarisation.
48
+ * Supported formats: PDF, plain-text files (.txt, .md, .rst, .csv, .json, etc.), HTML, and XML.
49
+ * The bot extracts and summarises content, with automatic encoding detection for non-UTF-8 files. Files over 20 MB are rejected.
50
+ * Can be disabled via `document_processing: false` in config.
44
51
  * Ask questions about message history across all your chats using natural language; the bot will search, attribute messages to speakers, and include messages from other bots.
45
52
  * Example: "Who said thanks for the breakdown?" or "What did George say about the project?" or "Show me the last few messages."
46
53
  * All search filters (speaker, chat, date) are optional. Results are ordered most-recent-first. Configure `search_limit` to control how many results to return (default: 30).
@@ -157,6 +164,7 @@ When the bot is triggered in a group and about to respond (not deferring to anot
157
164
  - `token_limit`: Max tokens (optional; defaults to model's maximum)
158
165
  - `search_limit`: Max search results (optional; defaults to 30)
159
166
  - `archive_days`: Days before messages are eligible for archival (optional; default 60, minimum 1). Older messages are distilled into daily summaries, then progressively compressed into monthly digests. Once archived, the raw messages no longer appear in the LLM context; they can only be retrieved through message search.
167
+ - `document_processing`: Optional bool (default: true). Set to false to disable document and text file summarisation.
160
168
  - `allow_local_webhooks`: Set to `true` to permit webhook/MCP URLs targeting loopback or link-local addresses (optional; default `false`). Useful when tools like Home Assistant run on the same host.
161
169
  - `tools`: Optional list of webhook and MCP tool definitions (admin-only, private chat only). See [docs/tools.md](docs/tools.md) for schema and examples.
162
170
  4. **Disable group privacy mode in BotFather:**
@@ -9,6 +9,10 @@ The basic goal of this project is to create a bridge between a Telegram Bot and
9
9
  * Pass URLs in [square brackets] and mention how the bot should interpret them.
10
10
  * Example: "What do you think of this article? [https://some_site/article]"
11
11
  * Uses a separate model (configurable via `url_model`) to handle larger URL content.
12
+ * Share documents and text files for analysis and summarisation.
13
+ * Supported formats: PDF, plain-text files (.txt, .md, .rst, .csv, .json, etc.), HTML, and XML.
14
+ * The bot extracts and summarises content, with automatic encoding detection for non-UTF-8 files. Files over 20 MB are rejected.
15
+ * Can be disabled via `document_processing: false` in config.
12
16
  * Ask questions about message history across all your chats using natural language; the bot will search, attribute messages to speakers, and include messages from other bots.
13
17
  * Example: "Who said thanks for the breakdown?" or "What did George say about the project?" or "Show me the last few messages."
14
18
  * All search filters (speaker, chat, date) are optional. Results are ordered most-recent-first. Configure `search_limit` to control how many results to return (default: 30).
@@ -125,6 +129,7 @@ When the bot is triggered in a group and about to respond (not deferring to anot
125
129
  - `token_limit`: Max tokens (optional; defaults to model's maximum)
126
130
  - `search_limit`: Max search results (optional; defaults to 30)
127
131
  - `archive_days`: Days before messages are eligible for archival (optional; default 60, minimum 1). Older messages are distilled into daily summaries, then progressively compressed into monthly digests. Once archived, the raw messages no longer appear in the LLM context; they can only be retrieved through message search.
132
+ - `document_processing`: Optional bool (default: true). Set to false to disable document and text file summarisation.
128
133
  - `allow_local_webhooks`: Set to `true` to permit webhook/MCP URLs targeting loopback or link-local addresses (optional; default `false`). Useful when tools like Home Assistant run on the same host.
129
134
  - `tools`: Optional list of webhook and MCP tool definitions (admin-only, private chat only). See [docs/tools.md](docs/tools.md) for schema and examples.
130
135
  4. **Disable group privacy mode in BotFather:**
@@ -40,7 +40,7 @@ from .initialize import (
40
40
  bind_log_identity,
41
41
  init_structure,
42
42
  )
43
- from .message_handlers import handle_greetings, handle_common_queries, handle_url_ask
43
+ from .message_handlers import handle_greetings, handle_common_queries, handle_url_ask, handle_document_message
44
44
  from .models import TokenLimits
45
45
  from .tools import build_tool_registry, discover_mcp_tools, execute_mcp, execute_webhook
46
46
  from .providers.factory import get_provider
@@ -50,16 +50,18 @@ from .utils import exact_word_match, log_error
50
50
  logger = logging.getLogger(__name__)
51
51
 
52
52
  # Dialog copy - centralised so tests never hard-code these strings
53
- _MSG_ADMIN_ONLY = "Sorry, I can't do that for you."
54
- _MSG_PROCESS_ERROR = "Sorry, I couldn't process your message! Please contact my creator."
55
- _MSG_TOOL_RESULT_ERROR = "Sorry, I couldn't process the tool result."
56
- _MSG_NOT_YOUR_PROMPT = "Sorry, this prompt is not for you!"
57
- _MSG_WIPE_PROMPT = "ALL of my memories will be lost! Are you sure?"
58
- _MSG_WIPE_COMPLETE = "Wipe complete. I hope you won't regret this..."
59
- _MSG_WIPE_CANCELLED = "Wipe cancelled. Whew, you scared me for a moment!"
60
- _MSG_FORGET_PROMPT = "Do you really want me to forget our memories together?"
61
- _MSG_FORGET_COMPLETE = "Forget complete. Fresh start it is..."
62
- _MSG_FORGET_CANCELLED = "Forget cancelled. Glad you changed your mind!"
53
+ _MSG_ADMIN_ONLY = "Sorry, I can't do that for you."
54
+ _MSG_PROCESS_ERROR = "Sorry, I couldn't process your message! Please contact my creator."
55
+ _MSG_TOOL_RESULT_ERROR = "Sorry, I couldn't process the tool result."
56
+ _MSG_DOC_PROCESSING_OFF = "Sorry, I can't process documents right now."
57
+ _MSG_OFFLINE = "I'd love to chat, but I am offline at the moment!"
58
+ _MSG_NOT_YOUR_PROMPT = "Sorry, this prompt is not for you!"
59
+ _MSG_WIPE_PROMPT = "ALL of my memories will be lost! Are you sure?"
60
+ _MSG_WIPE_COMPLETE = "Wipe complete. I hope you won't regret this..."
61
+ _MSG_WIPE_CANCELLED = "Wipe cancelled. Whew, you scared me for a moment!"
62
+ _MSG_FORGET_PROMPT = "Do you really want me to forget our memories together?"
63
+ _MSG_FORGET_COMPLETE = "Forget complete. Fresh start it is..."
64
+ _MSG_FORGET_CANCELLED = "Forget cancelled. Glad you changed your mind!"
63
65
 
64
66
  _SEARCH_TOOL = {
65
67
  "name": "search_messages",
@@ -409,6 +411,44 @@ class TelegramBot:
409
411
  "from group conversation contexts. Use /private off to disable."
410
412
  )
411
413
 
414
async def _get_or_load_conversation(
    self, chat_id: int, chat_type: str, chat_title: str | None, user_id: int,
) -> Conversation:
    """
    Return the Conversation for a chat, creating it on first use, with the user's context synced.

    A Conversation is registered under chat_id the first time the chat is seen. On every
    call its date/time line is refreshed via update_datetime(). The user's context is then
    either loaded for the first time (get_past_interaction) or topped up with anything new
    since the previous load (refresh_user_context), using half of the prune threshold as
    the token budget.

    Args:
        chat_id: Telegram chat ID; also the key into self.conversations.
        chat_type: 'private', 'group', or 'supergroup'.
        chat_title: Chat title, or None for private chats.
        user_id: Telegram user ID of whoever sent the triggering message.

    Returns:
        The Conversation stored for chat_id.
    """
    conversations = self.conversations
    if chat_id not in conversations:
        conversations[chat_id] = Conversation(
            chat_id, chat_type, self.llm['prompt'], self.llm['chat_model'], chat_title,
        )
    active = conversations[chat_id]
    active.update_datetime()

    half_threshold = floor(self.llm['prune_threshold'] / 2)
    if user_id in active._context_cursor:
        # Context already loaded for this user - only pick up messages newer than the last load.
        await active.refresh_user_context(user_id, half_threshold)
    else:
        # First appearance of this user in this session - load their cross-chat history.
        # bot_id is passed so private chats can also load shared group context. If no
        # history exists yet, the cursor stays unset and the next message retries.
        await active.get_past_interaction(half_threshold, user_id, self.telegram['bot_id'])
    return active
451
+
412
452
  async def tele_handle_response(self, text: str, msg: Message) -> tuple[str, int | None]:
413
453
  """
414
454
  Primary function for handling any response including Generative AI, ensuring:
@@ -448,7 +488,7 @@ class TelegramBot:
448
488
  """
449
489
  # Starting ensures we get some kind of user account details for logging
450
490
  if not self._online:
451
- return "I'd love to chat, but I am offline at the moment!", None
491
+ return _MSG_OFFLINE, None
452
492
 
453
493
  # Extract identity and context from the message
454
494
  user_id = msg.from_user.id
@@ -461,31 +501,7 @@ class TelegramBot:
461
501
  identity = f"@{username}" if username else ' '.join(filter(None, [first_name, last_name]))
462
502
  logger.info(f"User {user_id} ({identity}) in {chat_type} Chat {chat_id}{f' ({chat_title})' if chat_title else ''}")
463
503
 
464
- # For a new session, create a Conversation keyed by chat_id
465
- if chat_id not in self.conversations:
466
- self.conversations[chat_id] = Conversation(
467
- chat_id,
468
- chat_type,
469
- self.llm['prompt'],
470
- self.llm['chat_model'],
471
- chat_title,
472
- )
473
-
474
- conv = self.conversations[chat_id]
475
-
476
- # Refresh datetime on every message
477
- conv.update_datetime()
478
-
479
- token_budget = floor(self.llm['prune_threshold'] / 2)
480
-
481
- if user_id not in conv._context_cursor:
482
- # First appearance of this user in this session - load their cross-chat history.
483
- # Pass bot_id so private chats can also load shared group context.
484
- # If no history exists yet, the cursor stays unset so the next message retries.
485
- await conv.get_past_interaction(token_budget, user_id, self.telegram['bot_id'])
486
- else:
487
- # Already loaded - check for new cross-chat messages since last load.
488
- await conv.refresh_user_context(user_id, token_budget)
504
+ conv = await self._get_or_load_conversation(chat_id, chat_type, chat_title, user_id)
489
505
 
490
506
  # Surface the replied-to message into context before adding the triggering message.
491
507
  await self._surface_replied_to_message(msg, conv)
@@ -695,29 +711,32 @@ class TelegramBot:
695
711
  )
696
712
  await prune_bot_messages(msg.chat.id)
697
713
 
698
- def _exclusive_foreign_mention(self, msg: Message) -> str | None:
714
+ def _exclusive_foreign_mention(self, msg: Message, caption: bool = False) -> str | None:
699
715
  """
700
716
  Return the first foreign @mention if all @mention entities are exclusively foreign.
701
717
 
702
718
  Used in the reply-to-bot path: when the user threads Kowi's message but addresses
703
719
  a different account via @mention, return that account's username so the caller can
704
720
  detect a redirect. Returns None when we are also @mentioned (co-mention),
705
- when there are no @mention entities, or when msg.entities is absent.
721
+ when there are no @mention entities, or when the relevant entities list is absent.
706
722
 
707
723
  Args:
708
724
  msg: The incoming Telegram Message object.
725
+ caption: True to read msg.caption_entities (document captions) instead of
726
+ msg.entities (text messages).
709
727
 
710
728
  Returns:
711
729
  The first foreign @username string (with leading @) if all @mentions are
712
730
  foreign, or None if we are co-mentioned or no @mention entities exist.
713
731
  """
714
- if not msg.entities:
732
+ entities = msg.caption_entities if caption else msg.entities
733
+ if not entities:
715
734
  return None
716
735
  our_username = self.telegram['username'].lower()
717
736
  first_foreign = None
718
- for entity in msg.entities:
737
+ for entity in entities:
719
738
  if entity.type == MessageEntity.MENTION:
720
- mentioned = msg.parse_entity(entity)
739
+ mentioned = msg.parse_caption_entity(entity) if caption else msg.parse_entity(entity)
721
740
  if mentioned.lstrip('@').lower() == our_username:
722
741
  return None
723
742
  if first_foreign is None:
@@ -747,6 +766,90 @@ class TelegramBot:
747
766
  except (TelegramError, AttributeError):
748
767
  await msg.reply_text("Got it!")
749
768
 
769
async def _resolve_group_trigger(
    self, msg: Message, text: str, context: ContextTypes.DEFAULT_TYPE, is_caption: bool = False,
) -> str | None:
    """
    Decide whether a group/supergroup message (or document caption) is addressed to this bot.

    Rules, evaluated in order:
        1. Reply to a foreign bot whose @mentions are exclusively foreign - not triggered,
           regardless of any username/nickname/initials match in the text.
        2. Our username appears in text - triggered; every occurrence (with optional
           leading '@') is removed and the result stripped.
        3. Our nickname or initials appear in text - triggered; text is left untouched.
        4. Reply to one of our own messages - triggered unless the @mentions are
           exclusively foreign.
        5. Anything else - not triggered.
    A read receipt is sent through _send_read_receipt() on every triggered path.

    Args:
        msg: The Telegram Message received in the group/supergroup.
        text: msg.text or msg.caption - the content scanned for trigger words.
        context: The Telegram context, forwarded to _send_read_receipt().
        is_caption: True when text is a document caption, so mentions are resolved from
            msg.caption_entities rather than msg.entities.

    Returns:
        The text to process (username mention stripped when matched), or None when the
        message does not trigger the bot.
    """
    bot_id = self.telegram['bot_id']
    replied = msg.reply_to_message
    replied_user = replied.from_user if replied is not None else None
    replies_to_us = replied_user is not None and replied_user.id == bot_id
    replies_to_other_bot = (
        replied_user is not None and replied_user.is_bot and replied_user.id != bot_id
    )

    # Rule 1: a foreign-bot thread addressed solely to other accounts always wins.
    if replies_to_other_bot and self._exclusive_foreign_mention(msg, is_caption):
        return None

    username = self.telegram['username']
    if exact_word_match(username, text):
        # Rule 2: explicit username - drop the mention before handing text onwards.
        mention = r'@?\b' + re.escape(username) + r'\b'
        text = re.sub(mention, '', text).strip()
    elif not (
        exact_word_match(self.telegram['nickname'], text)
        or exact_word_match(self.telegram['initials'], text)
    ):
        # Rules 4-5: without a name match, only a reply to our own message can trigger,
        # and even that yields when the @mentions point exclusively elsewhere.
        if not replies_to_us or self._exclusive_foreign_mention(msg, is_caption):
            return None

    await self._send_read_receipt(msg, context)
    return text
824
+
825
async def _send_chunked_reply(
    self, msg: Message, text: str, conv: Conversation | None = None, assistant_db_id: int | None = None,
) -> None:
    """
    Reply to msg with text, split into pieces that fit within Telegram's message length limit.

    Pieces are sent one after another as replies to msg. For every piece Telegram confirms:
      - its message ID is added to conv._loaded_message_ids when conv is provided;
      - its message ID is written to the assistant row via update_message_tg_id() when
        assistant_db_id is provided. Each write replaces the previous one, so the row
        ends up holding the ID of the last piece sent.
    A short sleep follows every send to reduce the risk of rate limiting.

    Args:
        msg: The Telegram Message being replied to.
        text: The complete response text.
        conv: The active Conversation, if any, whose in-session message IDs are tracked.
        assistant_db_id: DB row id of the stored assistant message, if any.
    """
    piece_size = MessageLimit.MAX_TEXT_LENGTH - 1
    for offset in range(0, len(text), piece_size):
        sent = await msg.reply_text(text[offset:offset + piece_size])
        if sent:
            telegram_id = sent.message_id
            if conv:
                conv._loaded_message_ids.add(telegram_id)
            if assistant_db_id:
                await update_message_tg_id(assistant_db_id, telegram_id)
        await asyncio.sleep(0.12)  # small pause to reduce risk of rate limiting
852
+
750
853
  async def tele_handle_message(self, update: Update, context: ContextTypes.DEFAULT_TYPE):
751
854
  """
752
855
  Route incoming Telegram messages to appropriate handlers based on chat type and trigger conditions.
@@ -804,66 +907,114 @@ class TelegramBot:
804
907
  response = _MSG_PROCESS_ERROR
805
908
  assistant_db_id = None
806
909
  if chat.type == 'supergroup' or chat.type == 'group':
807
- is_reply_to_bot = (
808
- msg.reply_to_message is not None and
809
- msg.reply_to_message.from_user is not None and
810
- msg.reply_to_message.from_user.id == self.telegram['bot_id']
811
- )
812
- is_reply_to_foreign_bot = (
813
- msg.reply_to_message is not None and
814
- msg.reply_to_message.from_user is not None and
815
- msg.reply_to_message.from_user.is_bot and
816
- msg.reply_to_message.from_user.id != self.telegram['bot_id']
817
- )
818
- # In a foreign-bot reply thread, an exclusive @mention of another account
819
- # takes absolute precedence over any nickname/initials match in the text.
820
- if is_reply_to_foreign_bot and self._exclusive_foreign_mention(msg):
821
- return
822
- if exact_word_match(self.telegram['username'], msg.text):
823
- # Explicit @username mention: strongest signal - respond even if another
824
- # bot is also @mentioned (both may be intentionally addressed).
825
- pattern = r'@?\b' + re.escape(self.telegram['username']) + r'\b'
826
- new_text = re.sub(pattern, '', msg.text).strip()
827
- await self._send_read_receipt(msg, context)
828
- response, assistant_db_id = await self.tele_handle_response(new_text, msg)
829
- elif (
830
- exact_word_match(self.telegram['nickname'], msg.text) or
831
- exact_word_match(self.telegram['initials'], msg.text)
832
- ):
833
- # Nickname/initials: always engage - no reliable way to distinguish
834
- # our name as addressee vs topic from text position alone.
835
- await self._send_read_receipt(msg, context)
836
- response, assistant_db_id = await self.tele_handle_response(msg.text, msg)
837
- elif is_reply_to_bot:
838
- # Reply-to-bot: weaker signal - yield silently if the message is
839
- # exclusively addressed to a foreign account via @mention.
840
- if self._exclusive_foreign_mention(msg):
841
- return
842
- await self._send_read_receipt(msg, context)
843
- response, assistant_db_id = await self.tele_handle_response(msg.text, msg)
844
- else:
910
+ triggered_text = await self._resolve_group_trigger(msg, msg.text, context)
911
+ if triggered_text is None:
845
912
  return
913
+ response, assistant_db_id = await self.tele_handle_response(triggered_text, msg)
846
914
  elif chat.type == 'private':
847
915
  response, assistant_db_id = await self.tele_handle_response(msg.text, msg)
848
916
  else:
849
917
  return
850
918
 
851
- # Split into smaller chunks since Telegram messages have a maximum text length (likely 4096)
852
- chunk_length = MessageLimit.MAX_TEXT_LENGTH - 1
853
- chunks = [response[i:i+chunk_length] for i in range(0, len(response), chunk_length)]
919
+ # Persist this chunk's Telegram message ID on the assistant row. Each call overwrites
920
+ # the previous, so only the last chunk's ID is retained for cross-session tier 2 dedup.
921
+ # Tier 1 covers all chunks in-session via _loaded_message_ids.
854
922
  conv = self.conversations.get(msg.chat.id)
855
- for chunk in chunks:
856
- sent = await msg.reply_text(chunk)
857
- if sent:
858
- if conv:
859
- conv._loaded_message_ids.add(sent.message_id)
860
- # Persist this chunk's Telegram message ID on the assistant row.
861
- # Each call overwrites the previous, so only the last chunk's ID is
862
- # retained for cross-session tier 2 dedup. Tier 1 covers all chunks
863
- # in-session via _loaded_message_ids.
864
- if assistant_db_id:
865
- await update_message_tg_id(assistant_db_id, sent.message_id)
866
- await asyncio.sleep(0.12) # small pause to reduce risk of rate limiting
923
+ await self._send_chunked_reply(msg, response, conv, assistant_db_id)
924
+
925
async def tele_handle_document(self, update: Update, context: ContextTypes.DEFAULT_TYPE):
    """
    Handle a Telegram document message by running it through the document summarisation flow.

    Gatekeeping, in order:
      1. tele_validate() must accept the update.
      2. In groups/supergroups the caption must pass _resolve_group_trigger() (evaluated
         against caption entities); untriggered documents are ignored silently. Chat
         types other than private/group/supergroup are ignored as well.
      3. While the bot is offline, the offline message is sent instead of processing.
      4. When 'document_processing' is disabled in the LLM settings, the
         processing-disabled message is sent instead.
      5. Documents reporting a size above 20,000,000 bytes are refused before download.

    Once past the gates, the user turn is stored as '[Document: filename] caption', an
    acknowledgement is sent, the file is downloaded and handed to
    handle_document_message() together with the URL model and the conversation's system
    content. The reply is stored as the assistant turn, the conversation is pruned if it
    exceeds the prune threshold, and the reply is sent in chunks. The is_private flag
    stored with both turns is only set in private chats where the user enabled private mode.

    Args:
        update: The Telegram Update carrying the document message.
        context: The Telegram context, used to download the file and send receipts.
    """
    validated = await self.tele_validate(update)
    if not validated:
        return
    msg, chat, user = validated

    caption = msg.caption or ''
    if chat.type in ('group', 'supergroup'):
        # Same trigger rules as text messages, evaluated against the caption.
        caption = await self._resolve_group_trigger(msg, caption, context, is_caption=True)
        if caption is None:
            return
    elif chat.type != 'private':
        return

    if not self._online:
        await msg.reply_text(_MSG_OFFLINE)
        return

    if not self.llm.get('document_processing', True):
        await msg.reply_text(_MSG_DOC_PROCESSING_OFF)
        return

    document = msg.document
    if document.file_size and document.file_size > 20_000_000:
        await msg.reply_text("That file is too large for me to read - please keep it under 20 MB.")
        return

    conv = await self._get_or_load_conversation(chat.id, chat.type, chat.title, user.id)

    private_mode = await get_private_mode(user.id)
    is_private = (chat.type == 'private') and private_mode

    # Only a '[Document: name] caption' marker is recorded as the user turn.
    filename = document.file_name or 'document'
    user_text = f"[Document: {filename}] {caption}".strip()
    user_msg_id = await conv.add_user_message(
        user_text,
        user.id,
        user.username,
        user.first_name or '',
        user.last_name or '',
        is_private,
        msg.message_id,
    )

    await msg.reply_text("Sure, give me a moment to read that...")

    tg_file = await context.bot.get_file(document.file_id)
    payload = bytes(await tg_file.download_as_bytearray())
    reply = await handle_document_message(
        payload, document.mime_type or '', filename, caption, self.llm['url_model'], conv.system_content
    )

    bot = self.telegram
    assistant_db_id = await conv.add_assistant_message(
        reply,
        bot['bot_id'],
        bot['username'],
        bot['first_name'],
        bot['last_name'],
        is_private,
        user_msg_id,
    )

    # Keep the conversation within budget before replying.
    if await conv.get_message_token_count() > self.llm['prune_threshold']:
        await conv.prune_conversation(self.llm['prune_back_to'])

    await self._send_chunked_reply(msg, reply, conv, assistant_db_id)
867
1018
 
868
1019
  async def tele_validate(self, update: Update) -> tuple[Message, Chat, User] | None:
869
1020
  """
@@ -1097,8 +1248,9 @@ class TelegramBot:
1097
1248
  token_limit = INIT_BOT_CONFIG['token_limit'],
1098
1249
  search_limit = INIT_BOT_CONFIG['search_limit'],
1099
1250
  persona_temp = INIT_BOT_CONFIG['persona_temp'],
1100
- archive_days = INIT_BOT_CONFIG['archive_days'],
1101
- persona_prompt = INIT_BOT_CONFIG['persona_prompt'],
1251
+ archive_days = INIT_BOT_CONFIG['archive_days'],
1252
+ document_processing = INIT_BOT_CONFIG['document_processing'],
1253
+ persona_prompt = INIT_BOT_CONFIG['persona_prompt'],
1102
1254
  key_status: ApiKeyStatus | None = None,
1103
1255
  instance_name: str | None = None,
1104
1256
  webhook_schemas: list | None = None,
@@ -1180,6 +1332,7 @@ class TelegramBot:
1180
1332
  self.telegram['app'].add_handler(CommandHandler('private', self.tele_private_command))
1181
1333
  self.telegram['app'].add_handler(MessageHandler(filters.COMMAND, self.tele_unknown_command))
1182
1334
  self.telegram['app'].add_handler(MessageHandler(filters.TEXT & ~filters.UpdateType.EDITED_MESSAGE, self.tele_handle_message))
1335
+ self.telegram['app'].add_handler(MessageHandler(filters.Document.ALL & ~filters.UpdateType.EDITED_MESSAGE, self.tele_handle_document))
1183
1336
  self.telegram['app'].add_error_handler(self.tele_error)
1184
1337
 
1185
1338
  # Validate optional config values before storing; warn and fall back to defaults on bad input
@@ -1199,14 +1352,15 @@ class TelegramBot:
1199
1352
  # Get our LLM spun up with defaults if not defined by user input
1200
1353
  # Tokens as integers measure the length of conversation messages
1201
1354
  self.llm = {
1202
- 'prompt' : persona_prompt,
1203
- 'chat_model' : chat_model,
1204
- 'url_model' : url_model,
1205
- 'token_limit' : token_limit or TokenLimits(chat_model).max_tokens(),
1206
- 'search_limit' : search_limit or 30,
1207
- 'temperature' : persona_temp or 1.0,
1208
- 'top_p' : 0.9,
1209
- 'archive_days' : archive_days if archive_days is not None else 60,
1355
+ 'prompt' : persona_prompt,
1356
+ 'chat_model' : chat_model,
1357
+ 'url_model' : url_model,
1358
+ 'token_limit' : token_limit or TokenLimits(chat_model).max_tokens(),
1359
+ 'search_limit' : search_limit or 30,
1360
+ 'temperature' : persona_temp or 1.0,
1361
+ 'top_p' : 0.9,
1362
+ 'archive_days' : archive_days if archive_days is not None else 60,
1363
+ 'document_processing' : document_processing if document_processing is not None else True,
1210
1364
  }
1211
1365
  # Set a rounded-down integer to prune a lengthy conversation by 500 tokens
1212
1366
  # Note if the upper limit is below 500, the lower limit is set to 0
@@ -1291,8 +1445,9 @@ class TelegramBot:
1291
1445
  token_limit = config['token_limit'],
1292
1446
  search_limit = config['search_limit'],
1293
1447
  persona_temp = config['persona_temp'],
1294
- archive_days = config['archive_days'],
1295
- persona_prompt = prompt,
1448
+ archive_days = config['archive_days'],
1449
+ document_processing = config.get('document_processing'),
1450
+ persona_prompt = prompt,
1296
1451
  key_status = key_status,
1297
1452
  instance_name = config['instance_name'],
1298
1453
  webhook_schemas = webhook_schemas,
@@ -113,6 +113,7 @@ INIT_BOT_CONFIG = {
113
113
  'persona_temp': None,
114
114
  'archive_days': None,
115
115
  'allow_local_webhooks': None,
116
+ 'document_processing': None,
116
117
  'persona_prompt': 'You are a generic test bot powered by a user-configured LLM.'
117
118
  }
118
119
 
@@ -123,6 +124,7 @@ INIT_BOT_CONFIG_COMMENTS = {
123
124
  'persona_temp': '# Optional, LLM temperature 0.0-2.0 (default: model\'s default)',
124
125
  'archive_days': '# Optional, days before messages are eligible for Tier 1 archival (default: 60, min: 1). Tier 2 triggers at 2x this value.',
125
126
  'allow_local_webhooks': '# Optional, set to true to permit webhook/MCP URLs targeting loopback or link-local addresses (default: false)',
127
+ 'document_processing': '# Optional, set to false to disable document summarisation (default: true)',
126
128
  }
127
129
 
128
130
  # Append the framework-owned system appendix to the persona prompt.
@@ -0,0 +1,316 @@
1
+ # Handles incoming messages, URLs, and documents unique to TeLLMgramBot
2
+ import io
3
+ import logging
4
+ import re
5
+ from pathlib import Path
6
+ from typing import Optional
7
+
8
+ from charset_normalizer import from_bytes as _cn_from_bytes
9
+ import defusedxml.ElementTree as _defusedxml_ET
10
+ import pypdf
11
+
12
+ from .utils import log_error
13
+ from .models import TokenLimits
14
+ from .web_utils import (
15
+ fetch_url,
16
+ strip_html_markup,
17
+ InvalidURLException,
18
+ InsecureURLException,
19
+ SusURLException,
20
+ )
21
+ from .providers.factory import get_provider
22
+
23
# Module-level logger named after this module.
logger = logging.getLogger(__name__)

# System-message template used for URL analysis. summarise_text() substitutes
# the literal '{content}' placeholder with the (possibly pruned) page text.
_URL_ANALYSIS_TEMPLATE = (
    "## URL Analysis\n"
    "The user has provided a URL to perform some level of analysis. You will infer "
    "the nature of the analysis from the user's query.\n\n"
    "The contents of the URL mentioned have already been harvested and cleansed. "
    "Note the URL contents will likely have sections of text that are less relevant "
    "to the user's question (headers, footers, menus, ads, etc.). You will need to "
    "ignore those sections of text and focus on the main content of the page.\n\n"
    "The contents of the URL are shown below:\n"
    "BEGIN URL CONTENTS\n"
    "{content}\n"
    "END URL CONTENTS\n"
)

# System-message template used for document analysis; same '{content}'
# placeholder convention as the URL template.
_DOCUMENT_ANALYSIS_TEMPLATE = (
    "## Document Analysis\n"
    "The user has shared a document for analysis. Infer the nature of the analysis "
    "from the user's caption or question. If no specific question is provided, "
    "summarise the document's main content and key points.\n\n"
    "The document contents are shown below:\n"
    "BEGIN DOCUMENT CONTENTS\n"
    "{content}\n"
    "END DOCUMENT CONTENTS\n"
)

# Lower-case file extensions (with leading dot) that _extract_document_text()
# treats as plain text when the MIME type alone does not say so.
_PLAIN_TEXT_EXTENSIONS = frozenset({
    '.txt', '.md', '.rst', '.csv', '.tsv', '.json', '.jsonl', '.xml',
    '.html', '.htm', '.yaml', '.yml', '.toml', '.ini', '.cfg', '.conf',
    '.log', '.py', '.js', '.ts', '.sh', '.bash', '.rb', '.go', '.rs',
    '.java', '.c', '.cpp', '.h', '.cs', '.php', '.sql', '.r', '.tex',
})

# MIME types routed to the HTML (tag-stripping) and XML (defusedxml) extractors.
_HTML_MIMES = frozenset({'text/html', 'application/xhtml+xml'})
_XML_MIMES = frozenset({'text/xml', 'application/xml'})
# Maximum number of PDF pages whose text is extracted; later pages are ignored.
_PDF_PAGE_CAP = 100
60
+
61
+
62
def handle_greetings(text: str) -> Optional[str]:
    """
    Answer a lone greeting word immediately, without involving the LLM.

    The message is title-cased, trimmed, and stripped of every non-word
    character before it is compared with the known greetings, e.g.:
    - ' hello ' -> 'Hello!'
    - 'Hey...?' -> 'Hey!'
    - 'SUP?!?!' -> 'Sup!'

    Returns:
        The matched greeting followed by '!', or None when the message is
        anything other than a single known greeting.
    """
    known_greetings = ('Hello', 'Hi', 'Hey', 'Heya', 'Sup', 'Yo')
    normalised = text.title().strip()
    candidate = re.sub(r'[^\w]', '', normalised)
    if candidate not in known_greetings:
        return None
    return f"{candidate}!"
74
+
75
+
76
def handle_common_queries(text: str) -> Optional[str]:
    """
    Give a canned reply to a few casual phrases, without involving the LLM.

    The message is lower-cased, trimmed, and stripped of every non-word
    character (spaces included) before matching, e.g.:
    - ' How you doing ' -> 'How YOU doin?'
    - 'What's up!' -> 'Wassup?'

    Returns:
        The canned reply, or None when the message matches no known phrase.
    """
    squashed = re.sub(r'[^\w]', '', text.lower().strip())
    if squashed.startswith('howyoudoin'):
        return 'How YOU doin?'
    if squashed in ('wassup', 'whatup', 'whatsup'):
        return 'Wassup?'
    return None
88
+
89
+
90
def _decode_bytes(raw: bytes) -> tuple:
    """
    Turn raw bytes into text, trying progressively more permissive decoders.

    Order of attempts: strict UTF-8, then charset-normalizer's best guess,
    then Latin-1 as the last resort.

    Returns:
        Tuple of (decoded_text, encoding). encoding is '' when the bytes were
        valid UTF-8, otherwise the name of the encoding that was used.
    """
    try:
        utf8_text = raw.decode('utf-8')
    except UnicodeDecodeError:
        utf8_text = None
    if utf8_text is not None:
        return utf8_text, ''

    # Not UTF-8: let charset-normalizer pick the most plausible encoding.
    best_guess = _cn_from_bytes(raw).best()
    if best_guess is None:
        return raw.decode('latin-1'), 'ISO-8859-1'
    return str(best_guess), best_guess.encoding
105
+
106
+
107
def _decode_with_annotation(file_bytes: bytes) -> str:
    """Decode bytes to text, prefixing a [File encoding: ...] note for non-UTF-8 input."""
    text, encoding = _decode_bytes(file_bytes)
    if encoding:
        text = f"[File encoding: {encoding}]\n{text}"
    return text


def _extract_document_text(file_bytes: bytes, mime_type: str, filename: str) -> tuple:
    """
    Extract plain text from document bytes, routing by MIME type and file extension.

    Routing precedence is PDF, then HTML, then XML, then generic plain text:
    - PDF text is extracted via pypdf (strict=False), reading at most
      _PDF_PAGE_CAP pages.
    - HTML is decoded and has its tags removed via strip_html_markup.
    - XML is parsed with defusedxml and its text nodes are joined with spaces.
      If parsing fails, or yields no text, the raw bytes are decoded as plain
      text instead.
    - Other plain-text types are decoded with _decode_bytes.
    Plain-text decodes (including the XML fallback) prepend a
    [File encoding: ...] annotation when the file was not UTF-8.

    Args:
        file_bytes: Raw document bytes.
        mime_type: Reported MIME type; may be empty/None and may carry
            parameters (e.g. 'text/html; charset=utf-8'), which are ignored.
        filename: Original filename used for extension-based routing (may be empty).

    Returns:
        Tuple of (text, error). On success text is the extracted content and error
        is None. On failure text is None and error is the user-facing response string.
    """
    # Keep only the media type: parameters such as '; charset=utf-8' would
    # otherwise defeat the exact-match comparisons below.
    mime = (mime_type or '').split(';', 1)[0].strip().lower()
    ext = Path(filename).suffix.lower() if filename else ''

    is_pdf = mime == 'application/pdf' or ext == '.pdf'
    is_html = ext in ('.html', '.htm') or mime in _HTML_MIMES
    is_xml = ext == '.xml' or mime in _XML_MIMES
    is_plain = mime.startswith('text/') or ext in _PLAIN_TEXT_EXTENSIONS

    if is_pdf:
        try:
            reader = pypdf.PdfReader(io.BytesIO(file_bytes), strict=False)
            text = '\n'.join(
                page.extract_text() or '' for page in reader.pages[:_PDF_PAGE_CAP]
            )
            if not text.strip():
                return None, "This PDF appears to be image-only; I can't read the text in it."
            return text, None
        except Exception as e:
            log_error(e, 'PDF')
            return None, "Something went wrong while reading that PDF. Please try again."

    if is_html:
        raw_text, _ = _decode_bytes(file_bytes)
        return strip_html_markup(raw_text), None

    if is_xml:
        try:
            root = _defusedxml_ET.fromstring(file_bytes)
            xml_text = ' '.join(root.itertext()).strip()
        except Exception:
            # Deliberate best-effort: XML that is malformed or rejected by
            # defusedxml is still shown to the LLM as plain text.
            xml_text = ''
        return xml_text or _decode_with_annotation(file_bytes), None

    if is_plain:
        return _decode_with_annotation(file_bytes), None

    return None, "I can only read plain text and PDF files right now."
170
+
171
+
172
async def _prune_to_fit(content: str, fits) -> str:
    """
    Return the longest prefix of content, cut at a space, that fits() accepts.

    Args:
        content: Text that is too long as a whole.
        fits: Async callable taking a candidate string and returning True when
            that candidate is within budget.

    Returns:
        The longest accepted word-boundary prefix. When not even the first word
        is accepted, that word is halved repeatedly until it is accepted or
        empty, so the result may be ''.

    The search assumes fits() is monotonic (if a prefix fits, every shorter
    prefix fits too), which lets it binary-search the cut point instead of
    dropping one word per measurement.
    """
    # Every space is a candidate cut: content[:i] drops the words after it.
    # Index 0 is excluded because it would produce an empty prefix.
    cuts = [i for i, ch in enumerate(content) if ch == ' ' and i]
    best = None
    lo, hi = 0, len(cuts)
    while lo < hi:
        mid = (lo + hi) // 2
        if await fits(content[:cuts[mid]]):
            best = cuts[mid]
            lo = mid + 1
        else:
            hi = mid
    if best is not None:
        return content[:best]

    # No space left to split on (minified JSON, base64, one long token) - halve
    # the remaining text so each step still guarantees progress toward ''.
    remaining = content[:cuts[0]] if cuts else content
    while remaining and not await fits(remaining):
        remaining = remaining[:len(remaining) // 2]
    return remaining


async def summarise_text(
    content: str,
    question: str,
    model: str,
    template: str,
    prompt: str = '',
) -> str:
    """
    Token-prune content, apply template, and complete via the LLM.

    Prunes content so the fully composed system message (prompt + template with
    content substituted) plus the user turn fits within the model's token budget
    (max_tokens - 500), then calls the LLM. Every measurement is taken against
    the composed messages, so a large template or persona prompt is accounted
    for. The cut point is located with a binary search over word boundaries, so
    the number of token counts grows logarithmically with the content length
    rather than linearly. The template must contain a {content} placeholder.

    Args:
        content: Text content to summarise (URL body or document text).
        question: The user's message or caption; used as the LLM user turn.
        model: LLM model name.
        template: System prompt template with a {content} placeholder.
        prompt: Bot persona prompt prepended to the composed system message.

    Returns:
        LLM response string. Appends a truncation note when content was pruned.
        Returns a user-friendly error string if token counting or the LLM call fails.
    """
    def _compose(c: str) -> str:
        system = template.replace('{content}', c)
        return f"{prompt}\n\n{system}" if prompt else system

    messages = [
        {"role": "system", "content": _compose(content)},
        {"role": "user", "content": question},
    ]
    lengthy = False
    pruned_tail = ''
    try:
        token_model = TokenLimits(model)
        token_limit = token_model.max_tokens() - 500

        async def _fits(candidate: str) -> bool:
            # Measure exactly what would be sent: persona + template + candidate.
            messages[0]["content"] = _compose(candidate)
            return await token_model.num_tokens_from_messages(messages) <= token_limit

        if not await _fits(content):
            lengthy = True
            working_content = await _prune_to_fit(content, _fits)
            # The search leaves the last probed candidate in place; restore the winner.
            messages[0]["content"] = _compose(working_content)
            pruned_tail = working_content[-50:]
        response = await get_provider(model).complete(model, messages)
    except Exception as e:
        log_error(e, model)
        return "Something went wrong while processing the content. Please try again later."
    if lengthy:
        response += (
            "\n\n*NOTE*: The content was too long and needed to be pruned for my summary."
            f" If the text after \"{pruned_tail}\" is crucial, insert the rest for me."
        )
    return response
237
+
238
+
239
async def handle_url_ask(text: str, model: str = 'gpt-4o', prompt: str = '') -> Optional[str]:
    """
    Summarise the page behind a URL that the user wrapped in [square brackets].

    The first bracketed http(s) URL in the message is fetched with fetch_url,
    its HTML markup is stripped, and the result is handed to summarise_text
    together with the URL analysis template and the bot's persona prompt.
    Validation and safety problems surface as the exceptions handled below
    (presumably raised by fetch_url - confirm against web_utils).

    Args:
        text: The message text potentially containing a URL in [square brackets].
        model: The LLM model to use for URL summarization (default: 'gpt-4o').
        prompt: Bot persona prompt prepended to the URL analysis system message.

    Returns:
        The summary when a URL was found and processed, a user-facing error
        message when processing failed, or None when the text has no bracketed URL.
    """
    bracketed = re.search(r'\[http(s)?://\S+]', text.strip())
    if bracketed is None:
        return None

    # Drop the surrounding square brackets.
    url = bracketed.group()[1:-1]
    try:
        page_source = await fetch_url(url)
        page_text = strip_html_markup(page_source)
        return await summarise_text(page_text, text, model, _URL_ANALYSIS_TEMPLATE, prompt)
    except InvalidURLException as err:
        log_error(err, 'URL')
        return "The URL you provided appears to be invalid. Could you please check it and try again?"
    except InsecureURLException:
        return (
            "The URL you provided is not secure. Could you please try another URL, "
            "or just pasting the relevant content here?"
        )
    except SusURLException:
        return (
            "The URL you provided is potentially unsafe, based on my internal scans. "
            "You can check the safety of URLS using this site: "
            "https://www.virustotal.com/gui/home/url"
        )
    except Exception as err:
        log_error(err, 'URL')
        return f"Something went wrong while fetching the URL: {err}"
280
+
281
+
282
async def handle_document_message(
    file_bytes: bytes,
    mime_type: str,
    filename: str,
    caption: str,
    model: str,
    prompt: str = '',
) -> str:
    """
    Turn an uploaded document into an LLM answer.

    Text is pulled out of the document by _extract_document_text(), which routes
    on MIME type and file extension, and is then passed to summarise_text() with
    the document analysis template. The log line written here records only the
    filename, MIME type, and byte size - not the document content.

    Args:
        file_bytes: Raw document bytes.
        mime_type: Reported MIME type (may be empty).
        filename: Original filename for extension-based routing (may be empty).
        caption: User's caption or question; used as the LLM user turn. When
            empty, a generic "summarise" request is used instead.
        model: LLM model to use for summarisation.
        prompt: Bot persona prompt prepended to the system message.

    Returns:
        LLM response string, or a user-facing error message string when the
        document could not be read.
    """
    shown_name = filename or 'unknown'
    shown_mime = mime_type or 'unknown'
    logger.info("Document: name=%s mime=%s size=%d", shown_name, shown_mime, len(file_bytes))

    extracted, failure = _extract_document_text(file_bytes, mime_type, filename)
    if failure:
        return failure

    if caption:
        user_turn = caption
    else:
        user_turn = 'Please summarise this document.'
    return await summarise_text(extracted, user_turn, model, _DOCUMENT_ANALYSIS_TEMPLATE, prompt)
@@ -330,7 +330,7 @@ async def discover_mcp_tools(
330
330
 
331
331
  raw_headers = entry.get('headers') or {}
332
332
  if not isinstance(raw_headers, dict):
333
- logger.warning(f"MCP server '{server_url}': 'headers' must be a dict; treating as empty.")
333
+ logger.warning(f"MCP server '{log_url}': 'headers' must be a dict; treating as empty.")
334
334
  raw_headers = {}
335
335
  expanded_headers = {}
336
336
  disabled = False
@@ -431,7 +431,7 @@ async def discover_mcp_tools(
431
431
  all_registered.add(tool_name)
432
432
  server_count += 1
433
433
 
434
- logger.info(f"MCP server '{server_url}': registered {server_count} tool(s).")
434
+ logger.info(f"MCP server '{log_url}': registered {server_count} tool(s).")
435
435
 
436
436
  return schemas, defs
437
437
 
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: TeLLMgramBot
3
- Version: 3.14.2
3
+ Version: 3.15.0
4
4
  Summary: LLM-powered Telegram bot (OpenAI + Anthropic)
5
5
  Home-page: https://github.com/Digital-Heresy/TeLLMgramBot
6
6
  Author: Digital Heresy
@@ -19,6 +19,9 @@ Requires-Dist: tiktoken>=0.12
19
19
  Requires-Dist: python-telegram-bot>=20.8
20
20
  Requires-Dist: aiosqlite>=0.19
21
21
  Requires-Dist: tzdata>=2025.2
22
+ Requires-Dist: pypdf>=6.0
23
+ Requires-Dist: defusedxml>=0.7
24
+ Requires-Dist: charset-normalizer>=3.0
22
25
  Dynamic: author
23
26
  Dynamic: author-email
24
27
  Dynamic: description
@@ -41,6 +44,10 @@ The basic goal of this project is to create a bridge between a Telegram Bot and
41
44
  * Pass URLs in [square brackets] and mention how the bot should interpret them.
42
45
  * Example: "What do you think of this article? [https://some_site/article]"
43
46
  * Uses a separate model (configurable via `url_model`) to handle larger URL content.
47
+ * Share documents and text files for analysis and summarisation.
48
+ * Supported formats: PDF, plain-text files (.txt, .md, .rst, .csv, .json, etc.), HTML, and XML.
49
+ * The bot extracts and summarises content, with automatic encoding detection for non-UTF-8 files. Files over 20 MB are rejected.
50
+ * Can be disabled via `document_processing: false` in config.
44
51
  * Ask questions about message history across all your chats using natural language; the bot will search, attribute messages to speakers, and include messages from other bots.
45
52
  * Example: "Who said thanks for the breakdown?" or "What did George say about the project?" or "Show me the last few messages."
46
53
  * All search filters (speaker, chat, date) are optional. Results are ordered most-recent-first. Configure `search_limit` to control how many results to return (default: 30).
@@ -157,6 +164,7 @@ When the bot is triggered in a group and about to respond (not deferring to anot
157
164
  - `token_limit`: Max tokens (optional; defaults to model's maximum)
158
165
  - `search_limit`: Max search results (optional; defaults to 30)
159
166
  - `archive_days`: Days before messages are eligible for archival (optional; default 60, minimum 1). Older messages are distilled into daily summaries, then progressively compressed into monthly digests. Once archived, the raw messages no longer appear in the LLM context; they remain available only through message search.
167
+ - `document_processing`: Optional bool (default: true). Set to false to disable document and text file summarisation.
160
168
  - `allow_local_webhooks`: Set to `true` to permit webhook/MCP URLs targeting loopback or link-local addresses (optional; default `false`). Useful when tools like Home Assistant run on the same host.
161
169
  - `tools`: Optional list of webhook and MCP tool definitions (admin-only, private chat only). See [docs/tools.md](docs/tools.md) for schema and examples.
162
170
  4. **Disable group privacy mode in BotFather:**
@@ -8,3 +8,6 @@ tiktoken>=0.12
8
8
  python-telegram-bot>=20.8
9
9
  aiosqlite>=0.19
10
10
  tzdata>=2025.2
11
+ pypdf>=6.0
12
+ defusedxml>=0.7
13
+ charset-normalizer>=3.0
@@ -5,7 +5,7 @@ with open("README.md", "r") as fh:
5
5
 
6
6
  setup(
7
7
  name='TeLLMgramBot',
8
- version='3.14.2',
8
+ version='3.15.0',
9
9
  packages=find_packages(),
10
10
  license='MIT',
11
11
  author='Digital Heresy',
@@ -24,7 +24,10 @@ setup(
24
24
  'tiktoken>=0.12',
25
25
  'python-telegram-bot>=20.8',
26
26
  'aiosqlite>=0.19',
27
- 'tzdata>=2025.2'
27
+ 'tzdata>=2025.2',
28
+ 'pypdf>=6.0',
29
+ 'defusedxml>=0.7',
30
+ 'charset-normalizer>=3.0',
28
31
  ],
29
32
  python_requires='>=3.10'
30
33
  )
@@ -1,153 +0,0 @@
1
- # Handles incoming messages and URLs unique for TeLLMgramBot
2
- import re
3
- from typing import Optional
4
- import validators
5
-
6
- from .utils import log_error
7
- from .models import TokenLimits
8
- from .web_utils import (
9
- fetch_url,
10
- strip_html_markup,
11
- InvalidURLException,
12
- InsecureURLException,
13
- SusURLException,
14
- )
15
- from .providers.factory import get_provider
16
-
17
# System-message template used for URL analysis. handle_url_ask() substitutes
# the literal '{url_content}' placeholder with the (possibly pruned) page text.
_URL_ANALYSIS_TEMPLATE = (
    "## URL Analysis\n"
    "The user has provided a URL to perform some level of analysis. You will infer "
    "the nature of the analysis from the user's query.\n\n"
    "The contents of the URL mentioned have already been harvested and cleansed. "
    "Note the URL contents will likely have sections of text that are less relevant "
    "to the user's question (headers, footers, menus, ads, etc.). You will need to "
    "ignore those sections of text and focus on the main content of the page.\n\n"
    "The contents of the URL are shown below:\n"
    "BEGIN URL CONTENTS\n"
    "{url_content}\n"
    "END URL CONTENTS\n"
)
30
-
31
-
32
def handle_greetings(text: str) -> Optional[str]:
    """
    Reply at once to a message that is nothing but a greeting word.

    Matching ignores case, surrounding whitespace, and punctuation:
    - ' hello ' -> 'Hello!'
    - 'Hey...?' -> 'Hey!'
    - 'SUP?!?!' -> 'Sup!'

    Returns:
        The greeting with a trailing '!', or None for any other message.
    """
    cleaned = re.sub(r'[^\w]', '', text.title().strip())
    is_greeting = cleaned in ('Hello', 'Hi', 'Hey', 'Heya', 'Sup', 'Yo')
    return f"{cleaned}!" if is_greeting else None
44
-
45
-
46
def handle_common_queries(text: str) -> Optional[str]:
    """
    Reply at once to a handful of casual phrases.

    Matching ignores case, whitespace, and punctuation:
    - ' How you doing ' -> 'How YOU doin?'
    - 'What's up!' -> 'Wassup?'

    Returns:
        The canned reply, or None when no phrase matches.
    """
    key = re.sub(r'[^\w]', '', text.lower().strip())
    if key.startswith('howyoudoin'):
        reply = 'How YOU doin?'
    elif key in ('wassup', 'whatup', 'whatsup'):
        reply = 'Wassup?'
    else:
        reply = None
    return reply
58
-
59
-
60
async def handle_url_ask(text: str, model: str = 'gpt-4o', prompt: str = '') -> Optional[str]:
    """
    Process URL content in an LLM to provide a summary.

    Extracts the first http(s) URL wrapped in square brackets [], validates it
    with validators.url before any network access, fetches it with fetch_url,
    strips the HTML markup, and summarises the text via the LLM named by model.
    The bot's persona prompt is prepended to the URL analysis system message so
    responses match the bot's personality.

    Args:
        text: The message text potentially containing a URL in [square brackets].
        model: The LLM model to use for URL summarization (default: 'gpt-4o').
        prompt: Bot persona prompt prepended to the URL analysis system message.

    Returns:
        A summary string if a URL was found and processed successfully, an error
        message string if processing failed, or None if no URL detected in text.

    Raises:
        No exceptions are raised; all errors are caught and logged, returning
        user-friendly error messages instead.
    """
    url_match = re.search(r'\[http(s)?://\S+]', text.strip())
    if not url_match:
        return None

    # Extract the URL from the message, but not the square brackets
    url = url_match.group()[1:-1]

    try:
        # Reject malformed URLs before making any request to them
        if not validators.url(url):
            raise InvalidURLException(f"Invalid URL parsed by message_handlers.handle_url_ask(): {url}")

        # Fetch the page and strip its HTML markup
        url_content = strip_html_markup(await fetch_url(url))

        # Build messages:
        # 1. URL content to be added into the system prompt template
        # 2. User message requesting URL in [square brackets]
        messages = [
            {"role": "system", "content": url_content},
            {"role": "user", "content": text}
        ]

        # Consider the maximum amount of tokens a LLM can support.
        # If the URL content is too big, we need to prune it down to a reasonable size.
        # Let's also reserve 500 tokens for prompt and response.
        lengthy_url = False
        pruned_tail = ''
        token_model = TokenLimits(model)
        token_count = await token_model.num_tokens_from_messages(messages)
        token_limit = token_model.max_tokens() - 500
        if token_count > token_limit:
            lengthy_url = True
            while token_count > token_limit and messages[0]["content"]:
                current = messages[0]["content"]
                # Remove the last word; when no space is left to split on, halve the
                # text instead so every iteration shrinks it and the loop terminates.
                head, _, _ = current.rpartition(' ')
                messages[0]["content"] = head if head else current[:len(current) // 2]
                token_count = await token_model.num_tokens_from_messages(messages)
            # Show the last 50 characters of the pruned URL content
            pruned_tail = messages[0]["content"][-50:]

        # Build system message: bot persona (if any) + URL analysis template with content
        url_system = _URL_ANALYSIS_TEMPLATE.replace('{url_content}', messages[0]["content"])
        messages[0]["content"] = f"{prompt}\n\n{url_system}" if prompt else url_system

        # Call the LLM for the response that summarizes URL content
        try:
            response = await get_provider(model).complete(model, messages)
        except Exception as e:
            log_error(e, f"{model} URL")
            return "Something went wrong while fetching the URL. Please try again later."

        # If the URL content was too long, let the user know
        if lengthy_url:
            response += ("\n\n"
                "*NOTE*: The URL content was too long and needed to be pruned for my summary."
                f" If the text after \"{pruned_tail}\" is crucial, insert the rest for me."
            )
        return response

    except InvalidURLException as e:
        log_error(e, 'URL')
        return "The URL you provided appears to be invalid. Could you please check it and try again?"
    except InsecureURLException:
        return ("The URL you provided is not secure. Could you please try another URL, or just pasting the "
                "relevant content here?")
    except SusURLException:
        return ("The URL you provided is potentially unsafe, based on my internal scans. You can check the safety "
                "of URLS using this site: https://www.virustotal.com/gui/home/url")
    except Exception as e:
        log_error(e, 'URL')
        return f"Something went wrong while fetching the URL: {e}"
File without changes
File without changes