langroid 0.1.100__py3-none-any.whl → 0.1.102__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as they appear in their public registry. It is provided for informational purposes only.
langroid/agent/base.py CHANGED
@@ -41,7 +41,7 @@ from langroid.utils.configuration import settings
 from langroid.utils.constants import NO_ANSWER
 from langroid.vector_store.base import VectorStore, VectorStoreConfig
 
-console = Console()
+console = Console(quiet=settings.quiet)
 
 logger = logging.getLogger(__name__)
@@ -278,8 +278,9 @@ class Agent(ABC):
             return None
         if isinstance(results, ChatDocument):
             return results
-        console.print(f"[red]{self.indent}", end="")
-        print(f"[red]Agent: {results}")
+        if not settings.quiet:
+            console.print(f"[red]{self.indent}", end="")
+            print(f"[red]Agent: {results}")
         sender_name = self.config.name
         if isinstance(msg, ChatDocument) and msg.function_call is not None:
             # if result was from handling an LLM `function_call`,
@@ -412,7 +413,7 @@ class Agent(ABC):
         with StreamingIfAllowed(self.llm, self.llm.get_stream()):
             response = await self.llm.agenerate(prompt, output_len)
 
-        if not self.llm.get_stream() or response.cached:
+        if not self.llm.get_stream() or response.cached and not settings.quiet:
             # We would have already displayed the msg "live" ONLY if
             # streaming was enabled, AND we did not find a cached response.
             # If we are here, it means the response has not yet been displayed.
@@ -422,7 +423,7 @@ class Agent(ABC):
             response,
             prompt,
             self.llm.get_stream(),
-            print_response_stats=self.config.show_stats,
+            print_response_stats=self.config.show_stats and not settings.quiet,
         )
         return ChatDocument.from_LLMResponse(response, displayed=True)
@@ -475,11 +476,11 @@ class Agent(ABC):
                 the completion context length of the LLM.
                 """
             )
-        if self.llm.get_stream():
+        if self.llm.get_stream() and not settings.quiet:
             console.print(f"[green]{self.indent}", end="")
         response = self.llm.generate(prompt, output_len)
 
-        if not self.llm.get_stream() or response.cached:
+        if not self.llm.get_stream() or response.cached and not settings.quiet:
             # we would have already displayed the msg "live" ONLY if
             # streaming was enabled, AND we did not find a cached response
             # If we are here, it means the response has not yet been displayed.
@@ -490,7 +491,7 @@ class Agent(ABC):
             response,
             prompt,
             self.llm.get_stream(),
-            print_response_stats=self.config.show_stats,
+            print_response_stats=self.config.show_stats and not settings.quiet,
         )
         return ChatDocument.from_LLMResponse(response, displayed=True)
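The recurring change in this file is output gating: the module-level console becomes `Console(quiet=settings.quiet)`, and every remaining print is guarded with `and not settings.quiet`. For reference, Rich's `Console` natively supports a `quiet` flag that silences all of its output; a minimal illustration:

    from rich.console import Console

    console = Console(quiet=True)
    console.print("[red]hello")  # suppressed: a quiet Console prints nothing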
langroid/agent/batch.py CHANGED
@@ -9,9 +9,10 @@ from rich.console import Console
 from langroid.agent.base import Agent
 from langroid.agent.chat_document import ChatDocument
 from langroid.agent.task import Task
+from langroid.utils.configuration import quiet_mode, settings
 from langroid.utils.logging import setup_colored_logging
 
-console = Console()
+console = Console(quiet=settings.quiet)
 
 setup_colored_logging()
@@ -52,9 +53,10 @@ def run_batch_tasks(
         return output_map(result)
 
     async def _do_all() -> List[Any]:
-        return await asyncio.gather(  # type: ignore
-            *(_do_task(input, i) for i, input in enumerate(inputs))
-        )
+        with quiet_mode():
+            return await asyncio.gather(  # type: ignore
+                *(_do_task(input, i) for i, input in enumerate(inputs))
+            )
 
     # show rich console spinner
langroid/agent/chat_agent.py CHANGED
@@ -20,7 +20,7 @@ from langroid.language_models.base import (
 from langroid.language_models.openai_gpt import OpenAIGPT
 from langroid.utils.configuration import settings
 
-console = Console()
+console = Console(quiet=settings.quiet)
 
 logger = logging.getLogger(__name__)
@@ -614,11 +614,11 @@ class ChatAgent(Agent):
         assert self.config.llm is not None and self.llm is not None
         output_len = output_len or self.config.llm.max_output_tokens
         with ExitStack() as stack:  # for conditionally using rich spinner
-            if not self.llm.get_stream():
+            if not self.llm.get_stream() and not settings.quiet:
                 # show rich spinner only if not streaming!
                 cm = console.status("LLM responding to messages...")
                 stack.enter_context(cm)
-            if self.llm.get_stream():
+            if self.llm.get_stream() and not settings.quiet:
                 console.print(f"[green]{self.indent}", end="")
         functions: Optional[List[LLMFunctionSpec]] = None
         fun_call: str | Dict[str, str] = "none"
@@ -647,12 +647,13 @@ class ChatAgent(Agent):
             response_str = str(response.function_call)
         else:
             response_str = response.message
-        print(cached + "[green]" + response_str)
+        if not settings.quiet:
+            print(cached + "[green]" + response_str)
         self.update_token_usage(
             response,
             messages,
             self.llm.get_stream(),
-            print_response_stats=self.config.show_stats,
+            print_response_stats=self.config.show_stats and not settings.quiet,
         )
         return ChatDocument.from_LLMResponse(response, displayed=True)
@@ -688,13 +689,14 @@ class ChatAgent(Agent):
             response_str = str(response.function_call)
         else:
             response_str = response.message
-        print(cached + "[green]" + response_str)
+        if not settings.quiet:
+            print(cached + "[green]" + response_str)
 
         self.update_token_usage(
             response,
             messages,
             self.llm.get_stream(),
-            print_response_stats=self.config.show_stats,
+            print_response_stats=self.config.show_stats and not settings.quiet,
        )
         return ChatDocument.from_LLMResponse(response, displayed=True)
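The `ExitStack` trick above is worth noting: the spinner's context manager is entered only when a condition holds (not streaming, not quiet), while the surrounding `with` block stays the same either way. Isolated as a runnable sketch:

    from contextlib import ExitStack

    from rich.console import Console

    console = Console()
    streaming, quiet = False, False

    with ExitStack() as stack:
        if not streaming and not quiet:
            # the spinner context is entered only on this branch
            stack.enter_context(console.status("LLM responding to messages..."))
        result = 2 + 2  # stand-in for the actual LLM call
    print(result)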
langroid/agent/special/doc_chat_agent.py CHANGED
@@ -21,8 +21,14 @@ from rich.console import Console
 from rich.prompt import Prompt
 
 from langroid.agent.base import Agent
+from langroid.agent.batch import run_batch_tasks
 from langroid.agent.chat_agent import ChatAgent, ChatAgentConfig
 from langroid.agent.chat_document import ChatDocMetaData, ChatDocument
+from langroid.agent.special.relevance_extractor_agent import (
+    RelevanceExtractorAgent,
+    RelevanceExtractorAgentConfig,
+)
+from langroid.agent.task import Task
 from langroid.embedding_models.models import OpenAIEmbeddingsConfig
 from langroid.language_models.base import StreamingIfAllowed
 from langroid.language_models.openai_gpt import OpenAIChatModel, OpenAIGPTConfig
@@ -60,6 +66,10 @@ You are a helpful assistant, helping me understand a collection of documents.
 """
 
 
+class DocChunkMetqdata(DocMetaData):
+    id: str
+
+
 class DocChatAgentConfig(ChatAgentConfig):
     """
     Attributes:
@@ -89,6 +99,7 @@ class DocChatAgentConfig(ChatAgentConfig):
     # It is False by default; its benefits depend on the context.
     hypothetical_answer: bool = False
     n_query_rephrases: int = 0
+    n_neighbor_chunks: int = 0  # how many neighbors on either side of match to retrieve
     use_fuzzy_match: bool = True
     use_bm25_search: bool = True
     cross_encoder_reranking_model: str = "cross-encoder/ms-marco-MiniLM-L-6-v2"
@@ -116,6 +127,7 @@ class DocChatAgentConfig(ChatAgentConfig):
         min_chunk_chars=200,
         discard_chunk_chars=5,  # discard chunks with fewer than this many chars
         n_similar_docs=3,
+        n_neighbor_ids=0,  # num chunk IDs to store on either side of each chunk
         pdf=PdfParsingConfig(
             # NOTE: PDF parsing is extremely challenging, and each library
             # has its own strengths and weaknesses.
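The two new knobs cooperate: `n_neighbor_ids` (parsing config) controls how many neighbor chunk IDs are stored in each chunk's metadata at indexing time, while `n_neighbor_chunks` (agent config) controls how many of those neighbors are actually retrieved around a match at query time. A hypothetical configuration pairing them (field names from the diffs above; the import path is an assumption):

    from langroid.agent.special.doc_chat_agent import DocChatAgentConfig

    cfg = DocChatAgentConfig(n_neighbor_chunks=2)  # retrieve 2 neighbors per side
    cfg.parsing.n_neighbor_ids = 5  # store 5 ids per side; should be >= n_neighbor_chunks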
@@ -189,6 +201,7 @@ class DocChatAgent(ChatAgent):
         if self.vecdb is None:
             raise ValueError("VecDB not set")
         self.chunked_docs = self.vecdb.get_all_documents()
+        # used for lexical similarity e.g. keyword search (bm25 etc)
         self.chunked_docs_clean = [
             Document(content=preprocess_text(d.content), metadata=d.metadata)
             for d in self.chunked_docs
@@ -503,9 +516,13 @@ class DocChatAgent(ChatAgent):
         if self.chunked_docs is None:
             logger.warning("No chunked docs; cannot use fuzzy matching")
             return []
+        if self.chunked_docs_clean is None:
+            logger.warning("No cleaned chunked docs; cannot use fuzzy-search")
+            return []
         fuzzy_match_docs = find_fuzzy_matches_in_docs(
             query,
             self.chunked_docs,
+            self.chunked_docs_clean,
             k=self.config.parsing.n_similar_docs * multiple,
             words_before=1000,
             words_after=1000,
@@ -540,6 +557,36 @@ class DocChatAgent(ChatAgent):
         ]
         return passages
 
+    def add_context_window(
+        self,
+        docs_scores: List[Tuple[Document, float]],
+    ) -> List[Tuple[Document, float]]:
+        """
+        In each doc's metadata, there may be a `window_ids` field indicating
+        the ids of the chunks around the current chunk. These window_ids may
+        overlap, so we:
+        - gather connected components of overlapping windows,
+        - split each component into roughly equal parts,
+        - create a new document for each part, preserving metadata.
+
+        We may have stored a longer list of window_ids than we need; we just
+        want `n_neighbor_chunks` neighbors on each side of the center of
+        window_ids ("each side" meaning before and after the match, in the
+        original text).
+
+        Args:
+            docs_scores (List[Tuple[Document, float]]): list of documents,
+                each paired with its match score for the query.
+
+        Returns:
+            List[Tuple[Document, float]]: list of (Document, score) tuples.
+        """
+        if self.vecdb is None or self.config.n_neighbor_chunks == 0:
+            return docs_scores
+        return self.vecdb.add_context_window(docs_scores, self.config.n_neighbor_chunks)
+
     def get_relevant_chunks(
         self, query: str, query_proxies: List[str] = []
     ) -> List[Document]:
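The real widening happens in `vecdb.add_context_window`, which is not shown in this diff. Conceptually (a simplified sketch, not the vecdb implementation): each matched chunk carries its stored `window_ids`; take `neighbors` ids on each side of the chunk's own position in that list, and merge overlapping windows so no chunk is returned twice:

    from typing import Dict, List, Tuple

    def widen(
        matches: List[Tuple[str, float]],  # (chunk_id, score) per match
        window_ids: Dict[str, List[str]],  # chunk_id -> stored neighbor-id window
        neighbors: int,
    ) -> List[Tuple[List[str], float]]:
        seen: set = set()
        out: List[Tuple[List[str], float]] = []
        for cid, score in matches:
            ids = window_ids.get(cid, [cid])
            center = ids.index(cid)
            window = ids[max(0, center - neighbors) : center + neighbors + 1]
            fresh = [i for i in window if i not in seen]  # merge overlapping windows
            seen.update(fresh)
            if fresh:
                out.append((fresh, score))
        return out

    matches = [("c", 0.9), ("d", 0.8)]
    windows = {"c": ["b", "c", "d"], "d": ["c", "d", "e"]}
    print(widen(matches, windows, neighbors=1))  # [(['b', 'c', 'd'], 0.9), (['e'], 0.8)]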
@@ -554,10 +601,11 @@ class DocChatAgent(ChatAgent):
         dynamically retrieved based on a window around a lexical match.
 
         These are the steps (some optional based on config):
-        - vector-embedding distance, from vecdb
-        - bm25-ranking (keyword similarity)
+        - semantic search based on vector-embedding distance, from vecdb
+        - lexical search using bm25-ranking (keyword similarity)
         - fuzzy matching (keyword similarity)
-        - re-ranking of doc-chunks using cross-encoder, pick top k
+        - re-ranking of doc-chunks by relevance to query, using cross-encoder,
+          and pick top k
 
         Args:
             query: original query (assumed to be in stand-alone form)
@@ -606,6 +654,9 @@ class DocChatAgent(ChatAgent):
         if len(passages) == 0:
             return []
 
+        passages_scores = [(p, 0.0) for p in passages]
+        passages_scores = self.add_context_window(passages_scores)
+        passages = [p for p, _ in passages_scores]
         # now passages can potentially have a lot of doc chunks,
         # so we re-rank them using a cross-encoder scoring model
         # https://www.sbert.net/examples/applications/retrieve_rerank
@@ -660,11 +711,56 @@ class DocChatAgent(ChatAgent):
         with console.status("[cyan]LLM Extracting verbatim passages..."):
             with StreamingIfAllowed(self.llm, False):
                 # these are async calls, one per passage; turn off streaming
-                extracts = self.llm.get_verbatim_extracts(query, passages)
+                extracts = self.get_verbatim_extracts(query, passages)
         extracts = [e for e in extracts if e.content != NO_ANSWER]
 
         return query, extracts
 
+    def get_verbatim_extracts(
+        self,
+        query: str,
+        passages: List[Document],
+    ) -> List[Document]:
+        """
+        Run RelevanceExtractorAgent in async/concurrent mode on passages,
+        to extract portions relevant to answering query, from each passage.
+
+        Args:
+            query (str): query to answer
+            passages (List[Document]): list of passages to extract from
+
+        Returns:
+            List[Document]: list of Documents containing extracts and metadata.
+        """
+        agent_cfg = RelevanceExtractorAgentConfig(
+            use_tools=False,
+            use_functions_api=True,
+            query=query,
+            segment_length=1,
+        )
+        agent_cfg.llm.stream = False  # disable streaming for concurrent calls
+
+        agent = RelevanceExtractorAgent(agent_cfg)
+        task = Task(
+            agent,
+            name="Relevance-Extractor",
+            default_human_response="",  # eliminate human response
+            only_user_quits_root=False,  # allow agent_response to quit via "DONE <msg>"
+        )
+
+        extracts = run_batch_tasks(
+            task,
+            passages,
+            input_map=lambda msg: msg.content,
+            output_map=lambda ans: ans.content if ans is not None else NO_ANSWER,
+        )
+        metadatas = [P.metadata for P in passages]
+        # return with metadata so we can use it downstream, e.g. to cite sources
+        return [
+            Document(content=e, metadata=m)
+            for e, m in zip(extracts, metadatas)
+            if (e != NO_ANSWER and len(e) > 0)
+        ]
+
     @no_type_check
     def answer_from_docs(self, query: str) -> Document:
         """
langroid/agent/special/relevance_extractor_agent.py CHANGED
@@ -13,6 +13,7 @@ from langroid.agent.chat_document import ChatDocument
 from langroid.agent.tools.segment_extract_tool import SegmentExtractTool
 from langroid.language_models.openai_gpt import OpenAIGPTConfig
 from langroid.parsing.utils import extract_numbered_segments, number_segments
+from langroid.utils.constants import NO_ANSWER
 
 console = Console()
 logger = logging.getLogger(__name__)
@@ -25,8 +26,8 @@ class RelevanceExtractorAgentConfig(ChatAgentConfig):
     system_message = """
     The user will give you a PASSAGE containing segments numbered as
     <#1#>, <#2#>, <#3#>, etc.,
-    followed by a QUERY. Your task is to extract the segment-numbers from the PASSAGE
-    that are relevant to the QUERY. You must use the `extract_segments`
+    followed by a QUERY. Your task is to extract ALL and ONLY the segment-numbers from
+    the PASSAGE that are RELEVANT to the QUERY. You must use the `extract_segments`
     tool/function to present your answer, by setting the `segment_list` field
     to a list of segment numbers or ranges, like "10,12,14-17".
     """
@@ -72,12 +73,17 @@ class RelevanceExtractorAgent(ChatAgent):
     async def llm_response_async(
         self, message: Optional[str | ChatDocument] = None
     ) -> Optional[ChatDocument]:
-        """Compose a prompt asking to extract relevant segments from a passage.
+        """
+        Compose a prompt asking to extract relevant segments from a passage.
         Steps:
         - number the segments in the passage
         - compose prompt
         - send to LLM
+        The LLM is expected to generate a structured msg according to the
+        SegmentExtractTool schema, i.e. it should contain a `segment_list` field
+        whose value is a list of segment numbers or ranges, like "10,12,14-17".
         """
+
         assert self.config.query is not None, "No query specified"
         assert message is not None, "No message specified"
         message_str = message.content if isinstance(message, ChatDocument) else message
@@ -97,9 +103,9 @@ class RelevanceExtractorAgent(ChatAgent):
         """Method to handle a SegmentExtractTool message from LLM"""
         spec = msg.segment_list
         if len(self.message_history) == 0:
-            return ""
+            return NO_ANSWER
         if spec is None or spec.strip() == "":
-            return ""
+            return NO_ANSWER
         assert self.numbered_passage is not None, "No numbered passage"
         # assume this has numbered segments
         extracts = extract_numbered_segments(self.numbered_passage, spec)
@@ -28,7 +28,7 @@ logger = logging.getLogger(__name__)
 
 
 class RecordMetadata(DocMetaData):
-    id: None | int | str = None
+    id: None | str = None
 
 
 class RecordDoc(Document):
langroid/agent/task.py CHANGED
@@ -310,7 +310,7 @@ class Task:
         while True:
             self.step()
             if self.done():
-                if self._level == 0:
+                if self._level == 0 and not settings.quiet:
                     print("[magenta]Bye, hope this was useful!")
                 break
             i += 1
@@ -370,7 +370,7 @@ class Task:
         while True:
             await self.step_async()
             if self.done():
-                if self._level == 0:
+                if self._level == 0 and not settings.quiet:
                     print("[magenta]Bye, hope this was useful!")
                 break
             i += 1
@@ -411,10 +411,12 @@ class Task:
             if self.agent.config.llm is None
             else self.agent.config.llm.chat_model
         )
-        print(
-            f"[bold magenta]{self._enter} Starting Agent "
-            f"{self.name} ({self.message_history_idx+1}) {llm_model} [/bold magenta]"
-        )
+        if not settings.quiet:
+            print(
+                f"[bold magenta]{self._enter} Starting Agent "
+                f"{self.name} ({self.message_history_idx+1}) "
+                f"{llm_model} [/bold magenta]"
+            )
 
     def _post_run_loop(self) -> None:
         # delete all messages from our agent's history, AFTER the first incoming
@@ -437,10 +439,11 @@ class Task:
         # ONLY talking to the current agent.
         if isinstance(t.agent, ChatAgent):
             t.agent.clear_history(0)
-        print(
-            f"[bold magenta]{self._leave} Finished Agent "
-            f"{self.name} ({n_messages}) [/bold magenta]"
-        )
+        if not settings.quiet:
+            print(
+                f"[bold magenta]{self._leave} Finished Agent "
+                f"{self.name} ({n_messages}) [/bold magenta]"
+            )
 
     def step(self, turns: int = -1) -> ChatDocument | None:
         """
langroid/mytypes.py CHANGED
@@ -26,6 +26,8 @@ class DocMetaData(BaseModel):
 
     source: str = "context"
     is_chunk: bool = False  # if it is a chunk, don't split
+    id: str | None = None  # unique id for the document
+    window_ids: List[str] = []  # for RAG: ids of chunks around this one
 
     def dict(self, *args: Any, **kwargs: Any) -> Dict[str, Any]:
         """
@@ -51,9 +53,10 @@ class Document(BaseModel):
     content: str
     metadata: DocMetaData
 
-    def _unique_hash_id(self) -> str:
+    @staticmethod
+    def hash_id(doc: str) -> str:
         # Encode the document as UTF-8
-        doc_utf8 = str(self).encode("utf-8")
+        doc_utf8 = str(doc).encode("utf-8")
 
         # Create a SHA256 hash object
         sha256_hash = hashlib.sha256()
@@ -69,8 +72,11 @@ class Document(BaseModel):
 
         return str(hash_uuid)
 
-    def id(self) -> Any:
-        if hasattr(self.metadata, "id"):
+    def _unique_hash_id(self) -> str:
+        return self.hash_id(str(self))
+
+    def id(self) -> str:
+        if hasattr(self.metadata, "id") and self.metadata.id is not None:
             return self.metadata.id
         else:
             return self._unique_hash_id()
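The net effect of the `mytypes.py` changes: an explicitly assigned `metadata.id` now takes precedence, and the content-derived hash is only a fallback. A short sketch of the expected behavior:

    from langroid.mytypes import DocMetaData, Document

    doc = Document(content="hello", metadata=DocMetaData(source="test"))
    auto_id = doc.id()  # no explicit id yet: falls back to the SHA256-based hash

    doc.metadata.id = "chunk-42"
    assert doc.id() == "chunk-42"  # explicit id wins over the hash fallback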
langroid/parsing/document_parser.py CHANGED
@@ -200,6 +200,7 @@ class DocumentParser(Parser):
                 ),
             )
         )
+        self.add_window_ids(docs)
         return docs
langroid/parsing/parser.py CHANGED
@@ -1,6 +1,5 @@
 import logging
 from enum import Enum
-from functools import reduce
 from typing import List
 
 import tiktoken
@@ -36,6 +35,7 @@ class ParsingConfig(BaseSettings):
     min_chunk_chars: int = 350
     discard_chunk_chars: int = 5  # discard chunks with fewer than this many chars
     n_similar_docs: int = 4
+    n_neighbor_ids: int = 0  # window size to store around each chunk
     separators: List[str] = ["\n\n", "\n", " ", ""]
     token_encoding_model: str = "text-embedding-ada-002"
     pdf: PdfParsingConfig = PdfParsingConfig()
@@ -51,17 +51,42 @@ class Parser:
         tokens = self.tokenizer.encode(text)
         return len(tokens)
 
+    def add_window_ids(self, chunks: List[Document]) -> None:
+        """Chunks are consecutive parts of a single original document.
+        Add window_ids in metadata."""
+
+        # The original metadata.id (if any) is ignored since it will be the same
+        # for all chunks and is useless. We want a distinct id for each chunk.
+        ids = [Document.hash_id(str(c)) for c in chunks]
+
+        k = self.config.n_neighbor_ids
+        n = len(ids)
+        window_ids = [ids[max(0, i - k) : min(n, i + k + 1)] for i in range(n)]
+        for i, c in enumerate(chunks):
+            if c.content.strip() == "":
+                continue
+            c.metadata.window_ids = window_ids[i]
+            c.metadata.id = ids[i]
+            c.metadata.is_chunk = True
+
     def split_simple(self, docs: List[Document]) -> List[Document]:
         if len(self.config.separators) == 0:
             raise ValueError("Must have at least one separator")
-        return [
-            Document(content=chunk.strip(), metadata=d.metadata)
-            for d in docs
-            for chunk in remove_extra_whitespace(d.content).split(
-                self.config.separators[0]
-            )
-            if chunk.strip() != ""
-        ]
+        final_docs = []
+        for d in docs:
+            if d.content.strip() == "":
+                continue
+            chunks = remove_extra_whitespace(d.content).split(self.config.separators[0])
+            chunk_docs = [
+                Document(
+                    content=c, metadata=d.metadata.copy(update=dict(is_chunk=True))
+                )
+                for c in chunks
+                if c.strip() != ""
+            ]
+            self.add_window_ids(chunk_docs)
+            final_docs += chunk_docs
+        return final_docs
 
     def split_para_sentence(self, docs: List[Document]) -> List[Document]:
         final_chunks = []
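The windowing arithmetic in `add_window_ids`, isolated: with `k = n_neighbor_ids = 1` and five chunk ids, each chunk's stored window is itself plus up to one neighbor on each side, clipped at the document boundaries:

    ids = ["a", "b", "c", "d", "e"]
    k, n = 1, len(ids)
    window_ids = [ids[max(0, i - k) : min(n, i + k + 1)] for i in range(n)]
    print(window_ids)
    # [['a', 'b'], ['a', 'b', 'c'], ['b', 'c', 'd'], ['c', 'd', 'e'], ['d', 'e']]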
@@ -95,28 +120,37 @@ class Parser:
         return final_chunks + chunks
 
     def _split_para_sentence_once(self, docs: List[Document]) -> List[Document]:
-        chunked_docs = [
-            [
-                Document(content=chunk.strip(), metadata=d.metadata)
-                for chunk in create_chunks(
-                    d.content, self.config.chunk_size, self.num_tokens
-                )
-                if chunk.strip() != ""
-            ]
-            for d in docs
-        ]
-        return reduce(lambda x, y: x + y, chunked_docs)
+        final_chunks = []
+        for d in docs:
+            if d.content.strip() == "":
+                continue
+            chunks = create_chunks(d.content, self.config.chunk_size, self.num_tokens)
+            chunk_docs = [
+                Document(
+                    content=c, metadata=d.metadata.copy(update=dict(is_chunk=True))
+                )
+                for c in chunks
+                if c.strip() != ""
+            ]
+            self.add_window_ids(chunk_docs)
+            final_chunks += chunk_docs
+
+        return final_chunks
 
     def split_chunk_tokens(self, docs: List[Document]) -> List[Document]:
-        chunked_docs = [
-            [
-                Document(content=chunk.strip(), metadata=d.metadata)
-                for chunk in self.chunk_tokens(d.content)
-                if chunk.strip() != ""
-            ]
-            for d in docs
-        ]
-        return reduce(lambda x, y: x + y, chunked_docs)
+        final_docs = []
+        for d in docs:
+            chunks = self.chunk_tokens(d.content)
+            chunk_docs = [
+                Document(
+                    content=c, metadata=d.metadata.copy(update=dict(is_chunk=True))
+                )
+                for c in chunks
+                if c.strip() != ""
+            ]
+            self.add_window_ids(chunk_docs)
+            final_docs += chunk_docs
+        return final_docs
 
     def chunk_tokens(
         self,
@@ -198,11 +232,8 @@ class Parser:
             # Increment the number of chunks
             num_chunks += 1
 
-        # Handle the remaining tokens
-        if tokens:
-            remaining_text = self.tokenizer.decode(tokens).replace("\n", " ").strip()
-            if len(remaining_text) > self.config.discard_chunk_chars:
-                chunks.append(remaining_text)
+        # There may be remaining tokens, but we discard them
+        # since we have already reached the maximum number of chunks
 
         return chunks