langroid 0.1.100__py3-none-any.whl → 0.1.102__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as they appear in their public registry. It is provided for informational purposes only.
langroid/agent/base.py CHANGED
@@ -41,7 +41,7 @@ from langroid.utils.configuration import settings
 from langroid.utils.constants import NO_ANSWER
 from langroid.vector_store.base import VectorStore, VectorStoreConfig
 
-console = Console()
+console = Console(quiet=settings.quiet)
 
 logger = logging.getLogger(__name__)
@@ -278,8 +278,9 @@ class Agent(ABC):
             return None
         if isinstance(results, ChatDocument):
             return results
-        console.print(f"[red]{self.indent}", end="")
-        print(f"[red]Agent: {results}")
+        if not settings.quiet:
+            console.print(f"[red]{self.indent}", end="")
+            print(f"[red]Agent: {results}")
         sender_name = self.config.name
         if isinstance(msg, ChatDocument) and msg.function_call is not None:
             # if result was from handling an LLM `function_call`,
@@ -412,7 +413,7 @@ class Agent(ABC):
         with StreamingIfAllowed(self.llm, self.llm.get_stream()):
             response = await self.llm.agenerate(prompt, output_len)
 
-        if not self.llm.get_stream() or response.cached:
+        if not self.llm.get_stream() or response.cached and not settings.quiet:
             # We would have already displayed the msg "live" ONLY if
             # streaming was enabled, AND we did not find a cached response.
             # If we are here, it means the response has not yet been displayed.
@@ -422,7 +423,7 @@ class Agent(ABC):
             response,
             prompt,
             self.llm.get_stream(),
-            print_response_stats=self.config.show_stats,
+            print_response_stats=self.config.show_stats and not settings.quiet,
         )
         return ChatDocument.from_LLMResponse(response, displayed=True)
@@ -475,11 +476,11 @@ class Agent(ABC):
                 the completion context length of the LLM.
                 """
             )
-        if self.llm.get_stream():
+        if self.llm.get_stream() and not settings.quiet:
             console.print(f"[green]{self.indent}", end="")
         response = self.llm.generate(prompt, output_len)
 
-        if not self.llm.get_stream() or response.cached:
+        if not self.llm.get_stream() or response.cached and not settings.quiet:
             # we would have already displayed the msg "live" ONLY if
             # streaming was enabled, AND we did not find a cached response
             # If we are here, it means the response has not yet been displayed.
@@ -490,7 +491,7 @@ class Agent(ABC):
             response,
             prompt,
             self.llm.get_stream(),
-            print_response_stats=self.config.show_stats,
+            print_response_stats=self.config.show_stats and not settings.quiet,
         )
         return ChatDocument.from_LLMResponse(response, displayed=True)
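The recurring change in this file is output gating: the module-level console becomes `Console(quiet=settings.quiet)`, and every remaining print is guarded with `and not settings.quiet`. For reference, Rich's `Console` natively supports a `quiet` flag that silences all of its output; a minimal illustration:

    from rich.console import Console

    console = Console(quiet=True)
    console.print("[red]hello")  # suppressed: a quiet Console prints nothing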
langroid/agent/batch.py CHANGED
@@ -9,9 +9,10 @@ from rich.console import Console
 from langroid.agent.base import Agent
 from langroid.agent.chat_document import ChatDocument
 from langroid.agent.task import Task
+from langroid.utils.configuration import quiet_mode, settings
 from langroid.utils.logging import setup_colored_logging
 
-console = Console()
+console = Console(quiet=settings.quiet)
 
 setup_colored_logging()
@@ -52,9 +53,10 @@ def run_batch_tasks(
         return output_map(result)
 
     async def _do_all() -> List[Any]:
-        return await asyncio.gather(  # type: ignore
-            *(_do_task(input, i) for i, input in enumerate(inputs))
-        )
+        with quiet_mode():
+            return await asyncio.gather(  # type: ignore
+                *(_do_task(input, i) for i, input in enumerate(inputs))
+            )
 
     # show rich console spinner
langroid/agent/chat_agent.py CHANGED
@@ -20,7 +20,7 @@ from langroid.language_models.base import (
 from langroid.language_models.openai_gpt import OpenAIGPT
 from langroid.utils.configuration import settings
 
-console = Console()
+console = Console(quiet=settings.quiet)
 
 logger = logging.getLogger(__name__)
@@ -614,11 +614,11 @@ class ChatAgent(Agent):
         assert self.config.llm is not None and self.llm is not None
         output_len = output_len or self.config.llm.max_output_tokens
         with ExitStack() as stack:  # for conditionally using rich spinner
-            if not self.llm.get_stream():
+            if not self.llm.get_stream() and not settings.quiet:
                 # show rich spinner only if not streaming!
                 cm = console.status("LLM responding to messages...")
                 stack.enter_context(cm)
-            if self.llm.get_stream():
+            if self.llm.get_stream() and not settings.quiet:
                 console.print(f"[green]{self.indent}", end="")
         functions: Optional[List[LLMFunctionSpec]] = None
         fun_call: str | Dict[str, str] = "none"
@@ -647,12 +647,13 @@ class ChatAgent(Agent):
             response_str = str(response.function_call)
         else:
             response_str = response.message
-        print(cached + "[green]" + response_str)
+        if not settings.quiet:
+            print(cached + "[green]" + response_str)
         self.update_token_usage(
             response,
             messages,
             self.llm.get_stream(),
-            print_response_stats=self.config.show_stats,
+            print_response_stats=self.config.show_stats and not settings.quiet,
         )
         return ChatDocument.from_LLMResponse(response, displayed=True)
@@ -688,13 +689,14 @@ class ChatAgent(Agent):
             response_str = str(response.function_call)
         else:
             response_str = response.message
-        print(cached + "[green]" + response_str)
+        if not settings.quiet:
+            print(cached + "[green]" + response_str)
 
         self.update_token_usage(
             response,
             messages,
             self.llm.get_stream(),
-            print_response_stats=self.config.show_stats,
+            print_response_stats=self.config.show_stats and not settings.quiet,
        )
         return ChatDocument.from_LLMResponse(response, displayed=True)
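The `ExitStack` trick above is worth noting: the spinner's context manager is entered only when a condition holds (not streaming, not quiet), while the surrounding `with` block stays the same either way. Isolated as a runnable sketch:

    from contextlib import ExitStack

    from rich.console import Console

    console = Console()
    streaming, quiet = False, False

    with ExitStack() as stack:
        if not streaming and not quiet:
            # the spinner context is entered only on this branch
            stack.enter_context(console.status("LLM responding to messages..."))
        result = 2 + 2  # stand-in for the actual LLM call
    print(result)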
langroid/agent/special/doc_chat_agent.py CHANGED
@@ -21,8 +21,14 @@ from rich.console import Console
 from rich.prompt import Prompt
 
 from langroid.agent.base import Agent
+from langroid.agent.batch import run_batch_tasks
 from langroid.agent.chat_agent import ChatAgent, ChatAgentConfig
 from langroid.agent.chat_document import ChatDocMetaData, ChatDocument
+from langroid.agent.special.relevance_extractor_agent import (
+    RelevanceExtractorAgent,
+    RelevanceExtractorAgentConfig,
+)
+from langroid.agent.task import Task
 from langroid.embedding_models.models import OpenAIEmbeddingsConfig
 from langroid.language_models.base import StreamingIfAllowed
 from langroid.language_models.openai_gpt import OpenAIChatModel, OpenAIGPTConfig
@@ -60,6 +66,10 @@ You are a helpful assistant, helping me understand a collection of documents.
 """
 
 
+class DocChunkMetqdata(DocMetaData):
+    id: str
+
+
 class DocChatAgentConfig(ChatAgentConfig):
     """
     Attributes:
@@ -89,6 +99,7 @@ class DocChatAgentConfig(ChatAgentConfig):
     # It is False by default; its benefits depend on the context.
     hypothetical_answer: bool = False
     n_query_rephrases: int = 0
+    n_neighbor_chunks: int = 0  # how many neighbors on either side of match to retrieve
     use_fuzzy_match: bool = True
     use_bm25_search: bool = True
     cross_encoder_reranking_model: str = "cross-encoder/ms-marco-MiniLM-L-6-v2"
@@ -116,6 +127,7 @@ class DocChatAgentConfig(ChatAgentConfig):
         min_chunk_chars=200,
         discard_chunk_chars=5,  # discard chunks with fewer than this many chars
         n_similar_docs=3,
+        n_neighbor_ids=0,  # num chunk IDs to store on either side of each chunk
         pdf=PdfParsingConfig(
             # NOTE: PDF parsing is extremely challenging, and each library
             # has its own strengths and weaknesses.
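The two new knobs cooperate: `n_neighbor_ids` (parsing config) controls how many neighbor chunk IDs are stored in each chunk's metadata at indexing time, while `n_neighbor_chunks` (agent config) controls how many of those neighbors are actually retrieved around a match at query time. A hypothetical configuration pairing them (field names from the diffs above; the import path is an assumption):

    from langroid.agent.special.doc_chat_agent import DocChatAgentConfig

    cfg = DocChatAgentConfig(n_neighbor_chunks=2)  # retrieve 2 neighbors per side
    cfg.parsing.n_neighbor_ids = 5  # store 5 ids per side; should be >= n_neighbor_chunks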
@@ -189,6 +201,7 @@ class DocChatAgent(ChatAgent):
         if self.vecdb is None:
             raise ValueError("VecDB not set")
         self.chunked_docs = self.vecdb.get_all_documents()
+        # used for lexical similarity e.g. keyword search (bm25 etc)
         self.chunked_docs_clean = [
             Document(content=preprocess_text(d.content), metadata=d.metadata)
             for d in self.chunked_docs
@@ -503,9 +516,13 @@ class DocChatAgent(ChatAgent):
         if self.chunked_docs is None:
             logger.warning("No chunked docs; cannot use fuzzy matching")
             return []
+        if self.chunked_docs_clean is None:
+            logger.warning("No cleaned chunked docs; cannot use fuzzy-search")
+            return []
         fuzzy_match_docs = find_fuzzy_matches_in_docs(
             query,
             self.chunked_docs,
+            self.chunked_docs_clean,
             k=self.config.parsing.n_similar_docs * multiple,
             words_before=1000,
             words_after=1000,
@@ -540,6 +557,36 @@ class DocChatAgent(ChatAgent):
         ]
         return passages
 
+    def add_context_window(
+        self,
+        docs_scores: List[Tuple[Document, float]],
+    ) -> List[Tuple[Document, float]]:
+        """
+        In each doc's metadata, there may be a `window_ids` field indicating
+        the ids of the chunks around the current chunk. These window_ids may
+        overlap, so we:
+        - gather connected components of overlapping windows,
+        - split each component into roughly equal parts,
+        - create a new document for each part, preserving metadata.
+
+        We may have stored a longer list of window_ids than we need; we just
+        want `n_neighbor_chunks` neighbors on each side of the center of
+        window_ids ("each side" meaning before and after the match, in the
+        original text).
+
+        Args:
+            docs_scores (List[Tuple[Document, float]]): list of documents,
+                each paired with its match score for the query.
+
+        Returns:
+            List[Tuple[Document, float]]: list of (Document, score) tuples.
+        """
+        if self.vecdb is None or self.config.n_neighbor_chunks == 0:
+            return docs_scores
+        return self.vecdb.add_context_window(docs_scores, self.config.n_neighbor_chunks)
+
     def get_relevant_chunks(
         self, query: str, query_proxies: List[str] = []
     ) -> List[Document]:
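The real widening happens in `vecdb.add_context_window`, which is not shown in this diff. Conceptually (a simplified sketch, not the vecdb implementation): each matched chunk carries its stored `window_ids`; take `neighbors` ids on each side of the chunk's own position in that list, and merge overlapping windows so no chunk is returned twice:

    from typing import Dict, List, Tuple

    def widen(
        matches: List[Tuple[str, float]],  # (chunk_id, score) per match
        window_ids: Dict[str, List[str]],  # chunk_id -> stored neighbor-id window
        neighbors: int,
    ) -> List[Tuple[List[str], float]]:
        seen: set = set()
        out: List[Tuple[List[str], float]] = []
        for cid, score in matches:
            ids = window_ids.get(cid, [cid])
            center = ids.index(cid)
            window = ids[max(0, center - neighbors) : center + neighbors + 1]
            fresh = [i for i in window if i not in seen]  # merge overlapping windows
            seen.update(fresh)
            if fresh:
                out.append((fresh, score))
        return out

    matches = [("c", 0.9), ("d", 0.8)]
    windows = {"c": ["b", "c", "d"], "d": ["c", "d", "e"]}
    print(widen(matches, windows, neighbors=1))  # [(['b', 'c', 'd'], 0.9), (['e'], 0.8)]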
@@ -554,10 +601,11 @@ class DocChatAgent(ChatAgent):
         dynamically retrieved based on a window around a lexical match.
 
         These are the steps (some optional based on config):
-        - vector-embedding distance, from vecdb
-        - bm25-ranking (keyword similarity)
+        - semantic search based on vector-embedding distance, from vecdb
+        - lexical search using bm25-ranking (keyword similarity)
         - fuzzy matching (keyword similarity)
-        - re-ranking of doc-chunks using cross-encoder, pick top k
+        - re-ranking of doc-chunks by relevance to query, using cross-encoder,
+          and pick top k
 
         Args:
             query: original query (assumed to be in stand-alone form)
@@ -606,6 +654,9 @@ class DocChatAgent(ChatAgent):
         if len(passages) == 0:
             return []
 
+        passages_scores = [(p, 0.0) for p in passages]
+        passages_scores = self.add_context_window(passages_scores)
+        passages = [p for p, _ in passages_scores]
         # now passages can potentially have a lot of doc chunks,
         # so we re-rank them using a cross-encoder scoring model
         # https://www.sbert.net/examples/applications/retrieve_rerank
@@ -660,11 +711,56 @@ class DocChatAgent(ChatAgent):
         with console.status("[cyan]LLM Extracting verbatim passages..."):
             with StreamingIfAllowed(self.llm, False):
                 # these are async calls, one per passage; turn off streaming
-                extracts = self.llm.get_verbatim_extracts(query, passages)
+                extracts = self.get_verbatim_extracts(query, passages)
         extracts = [e for e in extracts if e.content != NO_ANSWER]
 
         return query, extracts
 
+    def get_verbatim_extracts(
+        self,
+        query: str,
+        passages: List[Document],
+    ) -> List[Document]:
+        """
+        Run RelevanceExtractorAgent in async/concurrent mode on passages,
+        to extract portions relevant to answering query, from each passage.
+
+        Args:
+            query (str): query to answer
+            passages (List[Document]): list of passages to extract from
+
+        Returns:
+            List[Document]: list of Documents containing extracts and metadata.
+        """
+        agent_cfg = RelevanceExtractorAgentConfig(
+            use_tools=False,
+            use_functions_api=True,
+            query=query,
+            segment_length=1,
+        )
+        agent_cfg.llm.stream = False  # disable streaming for concurrent calls
+
+        agent = RelevanceExtractorAgent(agent_cfg)
+        task = Task(
+            agent,
+            name="Relevance-Extractor",
+            default_human_response="",  # eliminate human response
+            only_user_quits_root=False,  # allow agent_response to quit via "DONE <msg>"
+        )
+
+        extracts = run_batch_tasks(
+            task,
+            passages,
+            input_map=lambda msg: msg.content,
+            output_map=lambda ans: ans.content if ans is not None else NO_ANSWER,
+        )
+        metadatas = [P.metadata for P in passages]
+        # return with metadata so we can use it downstream, e.g. to cite sources
+        return [
+            Document(content=e, metadata=m)
+            for e, m in zip(extracts, metadatas)
+            if (e != NO_ANSWER and len(e) > 0)
+        ]
+
     @no_type_check
     def answer_from_docs(self, query: str) -> Document:
         """
langroid/agent/special/relevance_extractor_agent.py CHANGED
@@ -13,6 +13,7 @@ from langroid.agent.chat_document import ChatDocument
 from langroid.agent.tools.segment_extract_tool import SegmentExtractTool
 from langroid.language_models.openai_gpt import OpenAIGPTConfig
 from langroid.parsing.utils import extract_numbered_segments, number_segments
+from langroid.utils.constants import NO_ANSWER
 
 console = Console()
 logger = logging.getLogger(__name__)
@@ -25,8 +26,8 @@ class RelevanceExtractorAgentConfig(ChatAgentConfig):
     system_message = """
     The user will give you a PASSAGE containing segments numbered as
     <#1#>, <#2#>, <#3#>, etc.,
-    followed by a QUERY. Your task is to extract the segment-numbers from the PASSAGE
-    that are relevant to the QUERY. You must use the `extract_segments`
+    followed by a QUERY. Your task is to extract ALL and ONLY the segment-numbers from
+    the PASSAGE that are RELEVANT to the QUERY. You must use the `extract_segments`
     tool/function to present your answer, by setting the `segment_list` field
     to a list of segment numbers or ranges, like "10,12,14-17".
     """
@@ -72,12 +73,17 @@ class RelevanceExtractorAgent(ChatAgent):
     async def llm_response_async(
         self, message: Optional[str | ChatDocument] = None
     ) -> Optional[ChatDocument]:
-        """Compose a prompt asking to extract relevant segments from a passage.
+        """
+        Compose a prompt asking to extract relevant segments from a passage.
         Steps:
         - number the segments in the passage
         - compose prompt
         - send to LLM
+        The LLM is expected to generate a structured msg according to the
+        SegmentExtractTool schema, i.e. it should contain a `segment_list` field
+        whose value is a list of segment numbers or ranges, like "10,12,14-17".
         """
+
         assert self.config.query is not None, "No query specified"
         assert message is not None, "No message specified"
         message_str = message.content if isinstance(message, ChatDocument) else message
@@ -97,9 +103,9 @@ class RelevanceExtractorAgent(ChatAgent):
         """Method to handle a SegmentExtractTool message from LLM"""
         spec = msg.segment_list
         if len(self.message_history) == 0:
-            return ""
+            return NO_ANSWER
         if spec is None or spec.strip() == "":
-            return ""
+            return NO_ANSWER
         assert self.numbered_passage is not None, "No numbered passage"
         # assume this has numbered segments
         extracts = extract_numbered_segments(self.numbered_passage, spec)
@@ -28,7 +28,7 @@ logger = logging.getLogger(__name__)
 
 
 class RecordMetadata(DocMetaData):
-    id: None | int | str = None
+    id: None | str = None
 
 
 class RecordDoc(Document):
langroid/agent/task.py CHANGED
@@ -310,7 +310,7 @@ class Task:
         while True:
             self.step()
             if self.done():
-                if self._level == 0:
+                if self._level == 0 and not settings.quiet:
                     print("[magenta]Bye, hope this was useful!")
                 break
             i += 1
@@ -370,7 +370,7 @@ class Task:
         while True:
             await self.step_async()
             if self.done():
-                if self._level == 0:
+                if self._level == 0 and not settings.quiet:
                     print("[magenta]Bye, hope this was useful!")
                 break
             i += 1
@@ -411,10 +411,12 @@ class Task:
             if self.agent.config.llm is None
             else self.agent.config.llm.chat_model
         )
-        print(
-            f"[bold magenta]{self._enter} Starting Agent "
-            f"{self.name} ({self.message_history_idx+1}) {llm_model} [/bold magenta]"
-        )
+        if not settings.quiet:
+            print(
+                f"[bold magenta]{self._enter} Starting Agent "
+                f"{self.name} ({self.message_history_idx+1}) "
+                f"{llm_model} [/bold magenta]"
+            )
 
     def _post_run_loop(self) -> None:
         # delete all messages from our agent's history, AFTER the first incoming
@@ -437,10 +439,11 @@ class Task:
         # ONLY talking to the current agent.
         if isinstance(t.agent, ChatAgent):
             t.agent.clear_history(0)
-        print(
-            f"[bold magenta]{self._leave} Finished Agent "
-            f"{self.name} ({n_messages}) [/bold magenta]"
-        )
+        if not settings.quiet:
+            print(
+                f"[bold magenta]{self._leave} Finished Agent "
+                f"{self.name} ({n_messages}) [/bold magenta]"
+            )
 
     def step(self, turns: int = -1) -> ChatDocument | None:
         """
langroid/mytypes.py CHANGED
@@ -26,6 +26,8 @@ class DocMetaData(BaseModel):
 
     source: str = "context"
     is_chunk: bool = False  # if it is a chunk, don't split
+    id: str | None = None  # unique id for the document
+    window_ids: List[str] = []  # for RAG: ids of chunks around this one
 
     def dict(self, *args: Any, **kwargs: Any) -> Dict[str, Any]:
         """
@@ -51,9 +53,10 @@ class Document(BaseModel):
     content: str
     metadata: DocMetaData
 
-    def _unique_hash_id(self) -> str:
+    @staticmethod
+    def hash_id(doc: str) -> str:
         # Encode the document as UTF-8
-        doc_utf8 = str(self).encode("utf-8")
+        doc_utf8 = str(doc).encode("utf-8")
 
         # Create a SHA256 hash object
         sha256_hash = hashlib.sha256()
@@ -69,8 +72,11 @@ class Document(BaseModel):
 
         return str(hash_uuid)
 
-    def id(self) -> Any:
-        if hasattr(self.metadata, "id"):
+    def _unique_hash_id(self) -> str:
+        return self.hash_id(str(self))
+
+    def id(self) -> str:
+        if hasattr(self.metadata, "id") and self.metadata.id is not None:
             return self.metadata.id
         else:
             return self._unique_hash_id()
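The net effect of the `mytypes.py` changes: an explicitly assigned `metadata.id` now takes precedence, and the content-derived hash is only a fallback. A short sketch of the expected behavior:

    from langroid.mytypes import DocMetaData, Document

    doc = Document(content="hello", metadata=DocMetaData(source="test"))
    auto_id = doc.id()  # no explicit id yet: falls back to the SHA256-based hash

    doc.metadata.id = "chunk-42"
    assert doc.id() == "chunk-42"  # explicit id wins over the hash fallback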
langroid/parsing/document_parser.py CHANGED
@@ -200,6 +200,7 @@ class DocumentParser(Parser):
                 ),
             )
         )
+        self.add_window_ids(docs)
         return docs
langroid/parsing/parser.py CHANGED
@@ -1,6 +1,5 @@
 import logging
 from enum import Enum
-from functools import reduce
 from typing import List
 
 import tiktoken
@@ -36,6 +35,7 @@ class ParsingConfig(BaseSettings):
     min_chunk_chars: int = 350
     discard_chunk_chars: int = 5  # discard chunks with fewer than this many chars
     n_similar_docs: int = 4
+    n_neighbor_ids: int = 0  # window size to store around each chunk
     separators: List[str] = ["\n\n", "\n", " ", ""]
     token_encoding_model: str = "text-embedding-ada-002"
     pdf: PdfParsingConfig = PdfParsingConfig()
@@ -51,17 +51,42 @@ class Parser:
         tokens = self.tokenizer.encode(text)
         return len(tokens)
 
+    def add_window_ids(self, chunks: List[Document]) -> None:
+        """Chunks are consecutive parts of a single original document.
+        Add window_ids in metadata."""
+
+        # The original metadata.id (if any) is ignored since it will be the same
+        # for all chunks and is useless. We want a distinct id for each chunk.
+        ids = [Document.hash_id(str(c)) for c in chunks]
+
+        k = self.config.n_neighbor_ids
+        n = len(ids)
+        window_ids = [ids[max(0, i - k) : min(n, i + k + 1)] for i in range(n)]
+        for i, c in enumerate(chunks):
+            if c.content.strip() == "":
+                continue
+            c.metadata.window_ids = window_ids[i]
+            c.metadata.id = ids[i]
+            c.metadata.is_chunk = True
+
     def split_simple(self, docs: List[Document]) -> List[Document]:
         if len(self.config.separators) == 0:
             raise ValueError("Must have at least one separator")
-        return [
-            Document(content=chunk.strip(), metadata=d.metadata)
-            for d in docs
-            for chunk in remove_extra_whitespace(d.content).split(
-                self.config.separators[0]
-            )
-            if chunk.strip() != ""
-        ]
+        final_docs = []
+        for d in docs:
+            if d.content.strip() == "":
+                continue
+            chunks = remove_extra_whitespace(d.content).split(self.config.separators[0])
+            chunk_docs = [
+                Document(
+                    content=c, metadata=d.metadata.copy(update=dict(is_chunk=True))
+                )
+                for c in chunks
+                if c.strip() != ""
+            ]
+            self.add_window_ids(chunk_docs)
+            final_docs += chunk_docs
+        return final_docs
 
     def split_para_sentence(self, docs: List[Document]) -> List[Document]:
         final_chunks = []
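The windowing arithmetic in `add_window_ids`, isolated: with `k = n_neighbor_ids = 1` and five chunk ids, each chunk's stored window is itself plus up to one neighbor on each side, clipped at the document boundaries:

    ids = ["a", "b", "c", "d", "e"]
    k, n = 1, len(ids)
    window_ids = [ids[max(0, i - k) : min(n, i + k + 1)] for i in range(n)]
    print(window_ids)
    # [['a', 'b'], ['a', 'b', 'c'], ['b', 'c', 'd'], ['c', 'd', 'e'], ['d', 'e']]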
@@ -95,28 +120,37 @@ class Parser:
         return final_chunks + chunks
 
     def _split_para_sentence_once(self, docs: List[Document]) -> List[Document]:
-        chunked_docs = [
-            [
-                Document(content=chunk.strip(), metadata=d.metadata)
-                for chunk in create_chunks(
-                    d.content, self.config.chunk_size, self.num_tokens
-                )
-                if chunk.strip() != ""
-            ]
-            for d in docs
-        ]
-        return reduce(lambda x, y: x + y, chunked_docs)
+        final_chunks = []
+        for d in docs:
+            if d.content.strip() == "":
+                continue
+            chunks = create_chunks(d.content, self.config.chunk_size, self.num_tokens)
+            chunk_docs = [
+                Document(
+                    content=c, metadata=d.metadata.copy(update=dict(is_chunk=True))
+                )
+                for c in chunks
+                if c.strip() != ""
+            ]
+            self.add_window_ids(chunk_docs)
+            final_chunks += chunk_docs
+
+        return final_chunks
 
     def split_chunk_tokens(self, docs: List[Document]) -> List[Document]:
-        chunked_docs = [
-            [
-                Document(content=chunk.strip(), metadata=d.metadata)
-                for chunk in self.chunk_tokens(d.content)
-                if chunk.strip() != ""
-            ]
-            for d in docs
-        ]
-        return reduce(lambda x, y: x + y, chunked_docs)
+        final_docs = []
+        for d in docs:
+            chunks = self.chunk_tokens(d.content)
+            chunk_docs = [
+                Document(
+                    content=c, metadata=d.metadata.copy(update=dict(is_chunk=True))
+                )
+                for c in chunks
+                if c.strip() != ""
+            ]
+            self.add_window_ids(chunk_docs)
+            final_docs += chunk_docs
+        return final_docs
 
     def chunk_tokens(
         self,
@@ -198,11 +232,8 @@ class Parser:
             # Increment the number of chunks
             num_chunks += 1
 
-        # Handle the remaining tokens
-        if tokens:
-            remaining_text = self.tokenizer.decode(tokens).replace("\n", " ").strip()
-            if len(remaining_text) > self.config.discard_chunk_chars:
-                chunks.append(remaining_text)
+        # There may be remaining tokens, but we discard them
+        # since we have already reached the maximum number of chunks
 
         return chunks