langroid 0.37.1__tar.gz → 0.37.3__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (130)
  1. {langroid-0.37.1 → langroid-0.37.3}/PKG-INFO +1 -2
  2. {langroid-0.37.1 → langroid-0.37.3}/langroid/agent/special/doc_chat_agent.py +5 -10
  3. {langroid-0.37.1 → langroid-0.37.3}/langroid/embedding_models/models.py +2 -2
  4. {langroid-0.37.1 → langroid-0.37.3}/langroid/parsing/document_parser.py +58 -14
  5. {langroid-0.37.1 → langroid-0.37.3}/langroid/parsing/parser.py +1 -1
  6. {langroid-0.37.1 → langroid-0.37.3}/langroid/parsing/pdf_utils.py +15 -19
  7. {langroid-0.37.1 → langroid-0.37.3}/langroid/parsing/search.py +21 -4
  8. {langroid-0.37.1 → langroid-0.37.3}/langroid/vector_store/weaviatedb.py +4 -4
  9. {langroid-0.37.1 → langroid-0.37.3}/pyproject.toml +1 -2
  10. {langroid-0.37.1 → langroid-0.37.3}/.gitignore +0 -0
  11. {langroid-0.37.1 → langroid-0.37.3}/LICENSE +0 -0
  12. {langroid-0.37.1 → langroid-0.37.3}/README.md +0 -0
  13. {langroid-0.37.1 → langroid-0.37.3}/langroid/__init__.py +0 -0
  14. {langroid-0.37.1 → langroid-0.37.3}/langroid/agent/__init__.py +0 -0
  15. {langroid-0.37.1 → langroid-0.37.3}/langroid/agent/base.py +0 -0
  16. {langroid-0.37.1 → langroid-0.37.3}/langroid/agent/batch.py +0 -0
  17. {langroid-0.37.1 → langroid-0.37.3}/langroid/agent/callbacks/__init__.py +0 -0
  18. {langroid-0.37.1 → langroid-0.37.3}/langroid/agent/callbacks/chainlit.py +0 -0
  19. {langroid-0.37.1 → langroid-0.37.3}/langroid/agent/chat_agent.py +0 -0
  20. {langroid-0.37.1 → langroid-0.37.3}/langroid/agent/chat_document.py +0 -0
  21. {langroid-0.37.1 → langroid-0.37.3}/langroid/agent/openai_assistant.py +0 -0
  22. {langroid-0.37.1 → langroid-0.37.3}/langroid/agent/special/__init__.py +0 -0
  23. {langroid-0.37.1 → langroid-0.37.3}/langroid/agent/special/arangodb/__init__.py +0 -0
  24. {langroid-0.37.1 → langroid-0.37.3}/langroid/agent/special/arangodb/arangodb_agent.py +0 -0
  25. {langroid-0.37.1 → langroid-0.37.3}/langroid/agent/special/arangodb/system_messages.py +0 -0
  26. {langroid-0.37.1 → langroid-0.37.3}/langroid/agent/special/arangodb/tools.py +0 -0
  27. {langroid-0.37.1 → langroid-0.37.3}/langroid/agent/special/arangodb/utils.py +0 -0
  28. {langroid-0.37.1 → langroid-0.37.3}/langroid/agent/special/lance_doc_chat_agent.py +0 -0
  29. {langroid-0.37.1 → langroid-0.37.3}/langroid/agent/special/lance_rag/__init__.py +0 -0
  30. {langroid-0.37.1 → langroid-0.37.3}/langroid/agent/special/lance_rag/critic_agent.py +0 -0
  31. {langroid-0.37.1 → langroid-0.37.3}/langroid/agent/special/lance_rag/lance_rag_task.py +0 -0
  32. {langroid-0.37.1 → langroid-0.37.3}/langroid/agent/special/lance_rag/query_planner_agent.py +0 -0
  33. {langroid-0.37.1 → langroid-0.37.3}/langroid/agent/special/lance_tools.py +0 -0
  34. {langroid-0.37.1 → langroid-0.37.3}/langroid/agent/special/neo4j/__init__.py +0 -0
  35. {langroid-0.37.1 → langroid-0.37.3}/langroid/agent/special/neo4j/csv_kg_chat.py +0 -0
  36. {langroid-0.37.1 → langroid-0.37.3}/langroid/agent/special/neo4j/neo4j_chat_agent.py +0 -0
  37. {langroid-0.37.1 → langroid-0.37.3}/langroid/agent/special/neo4j/system_messages.py +0 -0
  38. {langroid-0.37.1 → langroid-0.37.3}/langroid/agent/special/neo4j/tools.py +0 -0
  39. {langroid-0.37.1 → langroid-0.37.3}/langroid/agent/special/relevance_extractor_agent.py +0 -0
  40. {langroid-0.37.1 → langroid-0.37.3}/langroid/agent/special/retriever_agent.py +0 -0
  41. {langroid-0.37.1 → langroid-0.37.3}/langroid/agent/special/sql/__init__.py +0 -0
  42. {langroid-0.37.1 → langroid-0.37.3}/langroid/agent/special/sql/sql_chat_agent.py +0 -0
  43. {langroid-0.37.1 → langroid-0.37.3}/langroid/agent/special/sql/utils/__init__.py +0 -0
  44. {langroid-0.37.1 → langroid-0.37.3}/langroid/agent/special/sql/utils/description_extractors.py +0 -0
  45. {langroid-0.37.1 → langroid-0.37.3}/langroid/agent/special/sql/utils/populate_metadata.py +0 -0
  46. {langroid-0.37.1 → langroid-0.37.3}/langroid/agent/special/sql/utils/system_message.py +0 -0
  47. {langroid-0.37.1 → langroid-0.37.3}/langroid/agent/special/sql/utils/tools.py +0 -0
  48. {langroid-0.37.1 → langroid-0.37.3}/langroid/agent/special/table_chat_agent.py +0 -0
  49. {langroid-0.37.1 → langroid-0.37.3}/langroid/agent/task.py +0 -0
  50. {langroid-0.37.1 → langroid-0.37.3}/langroid/agent/tool_message.py +0 -0
  51. {langroid-0.37.1 → langroid-0.37.3}/langroid/agent/tools/__init__.py +0 -0
  52. {langroid-0.37.1 → langroid-0.37.3}/langroid/agent/tools/duckduckgo_search_tool.py +0 -0
  53. {langroid-0.37.1 → langroid-0.37.3}/langroid/agent/tools/file_tools.py +0 -0
  54. {langroid-0.37.1 → langroid-0.37.3}/langroid/agent/tools/google_search_tool.py +0 -0
  55. {langroid-0.37.1 → langroid-0.37.3}/langroid/agent/tools/metaphor_search_tool.py +0 -0
  56. {langroid-0.37.1 → langroid-0.37.3}/langroid/agent/tools/orchestration.py +0 -0
  57. {langroid-0.37.1 → langroid-0.37.3}/langroid/agent/tools/recipient_tool.py +0 -0
  58. {langroid-0.37.1 → langroid-0.37.3}/langroid/agent/tools/retrieval_tool.py +0 -0
  59. {langroid-0.37.1 → langroid-0.37.3}/langroid/agent/tools/rewind_tool.py +0 -0
  60. {langroid-0.37.1 → langroid-0.37.3}/langroid/agent/tools/segment_extract_tool.py +0 -0
  61. {langroid-0.37.1 → langroid-0.37.3}/langroid/agent/xml_tool_message.py +0 -0
  62. {langroid-0.37.1 → langroid-0.37.3}/langroid/cachedb/__init__.py +0 -0
  63. {langroid-0.37.1 → langroid-0.37.3}/langroid/cachedb/base.py +0 -0
  64. {langroid-0.37.1 → langroid-0.37.3}/langroid/cachedb/momento_cachedb.py +0 -0
  65. {langroid-0.37.1 → langroid-0.37.3}/langroid/cachedb/redis_cachedb.py +0 -0
  66. {langroid-0.37.1 → langroid-0.37.3}/langroid/embedding_models/__init__.py +0 -0
  67. {langroid-0.37.1 → langroid-0.37.3}/langroid/embedding_models/base.py +0 -0
  68. {langroid-0.37.1 → langroid-0.37.3}/langroid/embedding_models/protoc/__init__.py +0 -0
  69. {langroid-0.37.1 → langroid-0.37.3}/langroid/embedding_models/protoc/embeddings.proto +0 -0
  70. {langroid-0.37.1 → langroid-0.37.3}/langroid/embedding_models/protoc/embeddings_pb2.py +0 -0
  71. {langroid-0.37.1 → langroid-0.37.3}/langroid/embedding_models/protoc/embeddings_pb2.pyi +0 -0
  72. {langroid-0.37.1 → langroid-0.37.3}/langroid/embedding_models/protoc/embeddings_pb2_grpc.py +0 -0
  73. {langroid-0.37.1 → langroid-0.37.3}/langroid/embedding_models/remote_embeds.py +0 -0
  74. {langroid-0.37.1 → langroid-0.37.3}/langroid/exceptions.py +0 -0
  75. {langroid-0.37.1 → langroid-0.37.3}/langroid/language_models/__init__.py +0 -0
  76. {langroid-0.37.1 → langroid-0.37.3}/langroid/language_models/azure_openai.py +0 -0
  77. {langroid-0.37.1 → langroid-0.37.3}/langroid/language_models/base.py +0 -0
  78. {langroid-0.37.1 → langroid-0.37.3}/langroid/language_models/config.py +0 -0
  79. {langroid-0.37.1 → langroid-0.37.3}/langroid/language_models/mock_lm.py +0 -0
  80. {langroid-0.37.1 → langroid-0.37.3}/langroid/language_models/openai_gpt.py +0 -0
  81. {langroid-0.37.1 → langroid-0.37.3}/langroid/language_models/prompt_formatter/__init__.py +0 -0
  82. {langroid-0.37.1 → langroid-0.37.3}/langroid/language_models/prompt_formatter/base.py +0 -0
  83. {langroid-0.37.1 → langroid-0.37.3}/langroid/language_models/prompt_formatter/hf_formatter.py +0 -0
  84. {langroid-0.37.1 → langroid-0.37.3}/langroid/language_models/prompt_formatter/llama2_formatter.py +0 -0
  85. {langroid-0.37.1 → langroid-0.37.3}/langroid/language_models/utils.py +0 -0
  86. {langroid-0.37.1 → langroid-0.37.3}/langroid/mytypes.py +0 -0
  87. {langroid-0.37.1 → langroid-0.37.3}/langroid/parsing/__init__.py +0 -0
  88. {langroid-0.37.1 → langroid-0.37.3}/langroid/parsing/agent_chats.py +0 -0
  89. {langroid-0.37.1 → langroid-0.37.3}/langroid/parsing/code_parser.py +0 -0
  90. {langroid-0.37.1 → langroid-0.37.3}/langroid/parsing/para_sentence_split.py +0 -0
  91. {langroid-0.37.1 → langroid-0.37.3}/langroid/parsing/parse_json.py +0 -0
  92. {langroid-0.37.1 → langroid-0.37.3}/langroid/parsing/repo_loader.py +0 -0
  93. {langroid-0.37.1 → langroid-0.37.3}/langroid/parsing/routing.py +0 -0
  94. {langroid-0.37.1 → langroid-0.37.3}/langroid/parsing/spider.py +0 -0
  95. {langroid-0.37.1 → langroid-0.37.3}/langroid/parsing/table_loader.py +0 -0
  96. {langroid-0.37.1 → langroid-0.37.3}/langroid/parsing/url_loader.py +0 -0
  97. {langroid-0.37.1 → langroid-0.37.3}/langroid/parsing/urls.py +0 -0
  98. {langroid-0.37.1 → langroid-0.37.3}/langroid/parsing/utils.py +0 -0
  99. {langroid-0.37.1 → langroid-0.37.3}/langroid/parsing/web_search.py +0 -0
  100. {langroid-0.37.1 → langroid-0.37.3}/langroid/prompts/__init__.py +0 -0
  101. {langroid-0.37.1 → langroid-0.37.3}/langroid/prompts/dialog.py +0 -0
  102. {langroid-0.37.1 → langroid-0.37.3}/langroid/prompts/prompts_config.py +0 -0
  103. {langroid-0.37.1 → langroid-0.37.3}/langroid/prompts/templates.py +0 -0
  104. {langroid-0.37.1 → langroid-0.37.3}/langroid/py.typed +0 -0
  105. {langroid-0.37.1 → langroid-0.37.3}/langroid/pydantic_v1/__init__.py +0 -0
  106. {langroid-0.37.1 → langroid-0.37.3}/langroid/pydantic_v1/main.py +0 -0
  107. {langroid-0.37.1 → langroid-0.37.3}/langroid/utils/__init__.py +0 -0
  108. {langroid-0.37.1 → langroid-0.37.3}/langroid/utils/algorithms/__init__.py +0 -0
  109. {langroid-0.37.1 → langroid-0.37.3}/langroid/utils/algorithms/graph.py +0 -0
  110. {langroid-0.37.1 → langroid-0.37.3}/langroid/utils/configuration.py +0 -0
  111. {langroid-0.37.1 → langroid-0.37.3}/langroid/utils/constants.py +0 -0
  112. {langroid-0.37.1 → langroid-0.37.3}/langroid/utils/git_utils.py +0 -0
  113. {langroid-0.37.1 → langroid-0.37.3}/langroid/utils/globals.py +0 -0
  114. {langroid-0.37.1 → langroid-0.37.3}/langroid/utils/logging.py +0 -0
  115. {langroid-0.37.1 → langroid-0.37.3}/langroid/utils/object_registry.py +0 -0
  116. {langroid-0.37.1 → langroid-0.37.3}/langroid/utils/output/__init__.py +0 -0
  117. {langroid-0.37.1 → langroid-0.37.3}/langroid/utils/output/citations.py +0 -0
  118. {langroid-0.37.1 → langroid-0.37.3}/langroid/utils/output/printing.py +0 -0
  119. {langroid-0.37.1 → langroid-0.37.3}/langroid/utils/output/status.py +0 -0
  120. {langroid-0.37.1 → langroid-0.37.3}/langroid/utils/pandas_utils.py +0 -0
  121. {langroid-0.37.1 → langroid-0.37.3}/langroid/utils/pydantic_utils.py +0 -0
  122. {langroid-0.37.1 → langroid-0.37.3}/langroid/utils/system.py +0 -0
  123. {langroid-0.37.1 → langroid-0.37.3}/langroid/utils/types.py +0 -0
  124. {langroid-0.37.1 → langroid-0.37.3}/langroid/vector_store/__init__.py +0 -0
  125. {langroid-0.37.1 → langroid-0.37.3}/langroid/vector_store/base.py +0 -0
  126. {langroid-0.37.1 → langroid-0.37.3}/langroid/vector_store/chromadb.py +0 -0
  127. {langroid-0.37.1 → langroid-0.37.3}/langroid/vector_store/lancedb.py +0 -0
  128. {langroid-0.37.1 → langroid-0.37.3}/langroid/vector_store/meilisearch.py +0 -0
  129. {langroid-0.37.1 → langroid-0.37.3}/langroid/vector_store/momento.py +0 -0
  130. {langroid-0.37.1 → langroid-0.37.3}/langroid/vector_store/qdrantdb.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: langroid
3
- Version: 0.37.1
3
+ Version: 0.37.3
4
4
  Summary: Harness LLMs with Multi-Agent Programming
5
5
  Author-email: Prasad Chalasani <pchalasani@gmail.com>
6
6
  License: MIT
@@ -102,7 +102,6 @@ Requires-Dist: python-docx<2.0.0,>=1.1.0; extra == 'doc-chat'
102
102
  Requires-Dist: unstructured[docx,pdf,pptx]<1.0.0,>=0.16.15; extra == 'doc-chat'
103
103
  Provides-Extra: docling
104
104
  Requires-Dist: docling<3.0.0,>=2.16.0; extra == 'docling'
105
- Requires-Dist: pypdf>=5.1.0; extra == 'docling'
106
105
  Provides-Extra: docx
107
106
  Requires-Dist: python-docx<2.0.0,>=1.1.0; extra == 'docx'
108
107
  Provides-Extra: fastembed
@@ -15,7 +15,6 @@ pip install "langroid[hf-embeddings]"
15
15
  """
16
16
 
17
17
  import logging
18
- import textwrap
19
18
  from collections import OrderedDict
20
19
  from functools import cache
21
20
  from typing import Any, Callable, Dict, List, Optional, Set, Tuple, no_type_check
@@ -82,7 +81,7 @@ You will be given various passages from these documents, and asked to answer que
82
81
  about them, or summarize them into coherent answers.
83
82
  """
84
83
 
85
- CHUNK_ENRICHMENT_DELIMITER = "\n<##-##-##>"
84
+ CHUNK_ENRICHMENT_DELIMITER = "\n<##-##-##>\n"
86
85
 
87
86
  has_sentence_transformers = False
88
87
  try:
@@ -805,9 +804,9 @@ class DocChatAgent(ChatAgent):
805
804
  Returns:
806
805
  str: string representation
807
806
  """
808
- contents = [f"Extract: {d.content}" for d in docs]
807
+ contents = [d.content for d in docs]
809
808
  sources = [d.metadata.source for d in docs]
810
- sources = [f"Source: {s}" if s is not None else "" for s in sources]
809
+ sources = [f"SOURCE: {s}" if s is not None else "" for s in sources]
811
810
  return "\n".join(
812
811
  [
813
812
  f"""
@@ -952,12 +951,8 @@ class DocChatAgent(ChatAgent):
952
951
  continue
953
952
 
954
953
  # Combine original content with questions in a structured way
955
- combined_content = textwrap.dedent(
956
- f"""\
957
- {doc.content}
958
- {enrichment_config.delimiter}
959
- {enrichment}
960
- """
954
+ combined_content = (
955
+ f"{doc.content}{enrichment_config.delimiter}{enrichment}"
961
956
  )
962
957
 
963
958
  new_doc = doc.copy(
@@ -18,7 +18,7 @@ AzureADTokenProvider = Callable[[], str]
18
18
 
19
19
  class OpenAIEmbeddingsConfig(EmbeddingModelsConfig):
20
20
  model_type: str = "openai"
21
- model_name: str = "text-embedding-3-large"
21
+ model_name: str = "text-embedding-3-small"
22
22
  api_key: str = ""
23
23
  api_base: Optional[str] = None
24
24
  organization: str = ""
@@ -28,7 +28,7 @@ class OpenAIEmbeddingsConfig(EmbeddingModelsConfig):
28
28
 
29
29
  class AzureOpenAIEmbeddingsConfig(EmbeddingModelsConfig):
30
30
  model_type: str = "azure-openai"
31
- model_name: str = "text-embedding-3-large"
31
+ model_name: str = "text-embedding-3-small"
32
32
  api_key: str = ""
33
33
  api_base: str = ""
34
34
  deployment_name: Optional[str] = None
@@ -2,13 +2,16 @@ from __future__ import annotations
2
2
 
3
3
  import itertools
4
4
  import logging
5
+ import os
5
6
  import re
7
+ import tempfile
6
8
  from enum import Enum
7
9
  from io import BytesIO
10
+ from itertools import accumulate
11
+ from pathlib import Path
8
12
  from typing import TYPE_CHECKING, Any, Dict, Generator, List, Tuple
9
13
 
10
14
  from langroid.exceptions import LangroidImportError
11
- from langroid.parsing.pdf_utils import pdf_split_pages
12
15
  from langroid.utils.object_registry import ObjectRegistry
13
16
 
14
17
  try:
@@ -507,6 +510,8 @@ class DoclingParser(DocumentParser):
507
510
  def iterate_pages(self) -> Generator[Tuple[int, Any], None, None]:
508
511
  """
509
512
  Yield each page in the PDF using `docling`.
513
+ Code largely from this example:
514
+ https://github.com/DS4SD/docling/blob/4d41db3f7abb86c8c65386bf94e7eb0bf22bb82b/docs/examples/export_figures.py
510
515
 
511
516
  Returns:
512
517
  Generator[docling.Page]: Generator yielding each page.
@@ -516,35 +521,74 @@ class DoclingParser(DocumentParser):
516
521
  "docling", ["docling", "pdf-parsers", "all", "doc-chat"]
517
522
  )
518
523
 
524
+ from docling.datamodel.base_models import InputFormat # type: ignore
525
+ from docling.datamodel.pipeline_options import PdfPipelineOptions
519
526
  from docling.document_converter import ( # type: ignore
520
527
  ConversionResult,
521
528
  DocumentConverter,
529
+ PdfFormatOption,
522
530
  )
523
531
  from docling_core.types.doc import ImageRefMode # type: ignore
524
532
 
525
- page_files, tmp_dir = pdf_split_pages(self.doc_bytes)
526
- converter = DocumentConverter()
527
- for i, page_file in enumerate(page_files):
528
- result: ConversionResult = converter.convert(page_file)
529
- md_text = result.document.export_to_markdown(
530
- image_mode=ImageRefMode.REFERENCED
531
- )
532
- yield i, md_text
533
+ IMAGE_RESOLUTION_SCALE = 2.0
534
+ pipeline_options = PdfPipelineOptions()
535
+ pipeline_options.images_scale = IMAGE_RESOLUTION_SCALE
536
+ pipeline_options.generate_page_images = True
537
+ pipeline_options.generate_picture_images = True
533
538
 
534
- tmp_dir.cleanup()
539
+ converter = DocumentConverter(
540
+ format_options={
541
+ InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)
542
+ }
543
+ )
544
+ doc_path = self.source
545
+ if doc_path == "bytes":
546
+ # write to tmp file, then use that path
547
+ with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as temp_file:
548
+ temp_file.write(self.doc_bytes.getvalue())
549
+ doc_path = temp_file.name
550
+
551
+ output_dir = Path(str(Path(doc_path).with_suffix("")) + "-pages")
552
+ os.makedirs(output_dir, exist_ok=True)
553
+
554
+ result: ConversionResult = converter.convert(doc_path)
555
+
556
+ def n_page_elements(page) -> int: # type: ignore
557
+ if page.assembled is None:
558
+ return 0
559
+ return 1 + len(page.assembled.elements)
560
+
561
+ page_element_count = [n_page_elements(i) for i in result.pages]
562
+ element_page_cutoff = list(accumulate([1] + page_element_count))
563
+ for i, page in enumerate(result.pages):
564
+ page_start = element_page_cutoff[i]
565
+ page_end = element_page_cutoff[i + 1]
566
+ md_file = output_dir / f"page_{i}.md"
567
+ # we could have just directly exported to a markdown string,
568
+ # but we need to save to a file to force generation of image-files.
569
+ result.document.save_as_markdown(
570
+ md_file,
571
+ image_mode=ImageRefMode.REFERENCED,
572
+ from_element=page_start,
573
+ to_element=page_end,
574
+ )
575
+ yield i, md_file
535
576
 
536
- def get_document_from_page(self, page: str) -> Document:
577
+ def get_document_from_page(self, md_file: str) -> Document:
537
578
  """
538
- Get Document object from a given `docling` "page" (actually a chunk).
579
+ Get Document object from a given 1-page markdown file,
580
+ possibly containing image refs.
539
581
 
540
582
  Args:
541
- page (docling.chunking.DocChunk): The `docling` chunk
583
+ md_file (str): The markdown file path for the page.
542
584
 
543
585
  Returns:
544
586
  Document: Document object, with content and possible metadata.
545
587
  """
588
+ with open(md_file, "r") as f:
589
+ text = f.read()
546
590
  return Document(
547
- content=self.fix_text(page),
591
+ content=self.fix_text(text),
548
592
  metadata=DocMetaData(source=self.source),
549
593
  )
550
594
 
@@ -51,7 +51,7 @@ class ParsingConfig(BaseSettings):
51
51
  n_similar_docs: int = 4
52
52
  n_neighbor_ids: int = 5 # window size to store around each chunk
53
53
  separators: List[str] = ["\n\n", "\n", " ", ""]
54
- token_encoding_model: str = "text-embedding-3-large"
54
+ token_encoding_model: str = "text-embedding-3-small"
55
55
  pdf: PdfParsingConfig = PdfParsingConfig()
56
56
  docx: DocxParsingConfig = DocxParsingConfig()
57
57
  doc: DocParsingConfig = DocParsingConfig()
@@ -5,27 +5,24 @@ from tempfile import TemporaryDirectory
5
5
  from typing import TYPE_CHECKING, Any, BinaryIO, List, Tuple, Union
6
6
 
7
7
  try:
8
- import pypdf
8
+ import fitz
9
9
  except ImportError:
10
10
  if not TYPE_CHECKING:
11
- pypdf = None
11
+ fitz = None
12
12
 
13
13
  from langroid.exceptions import LangroidImportError
14
14
 
15
- if pypdf is None:
16
- raise LangroidImportError(
17
- "pypdf", ["pypdf", "docling", "all", "pdf-parsers", "doc-chat"]
18
- )
19
- from pypdf import PdfReader, PdfWriter
15
+ if fitz is None:
16
+ raise LangroidImportError("fitz", ["pymupdf", "all", "pdf-parsers", "doc-chat"])
20
17
 
21
18
 
22
19
  def pdf_split_pages(
23
- input_pdf: Union[str, Path, BytesIO, BinaryIO],
20
+ input_pdf: Union[BytesIO, BinaryIO],
24
21
  ) -> Tuple[List[Path], TemporaryDirectory[Any]]:
25
22
  """Splits a PDF into individual pages in a temporary directory.
26
23
 
27
24
  Args:
28
- input_pdf: Input PDF file path or file-like object
25
+ input_pdf: Input PDF file in bytes or binary mode
29
26
  max_workers: Maximum number of concurrent workers for parallel processing
30
27
 
31
28
  Returns:
@@ -39,17 +36,16 @@ def pdf_split_pages(
39
36
  tmp_dir.cleanup() # Clean up temp files when done
40
37
  """
41
38
  tmp_dir = tempfile.TemporaryDirectory()
42
- reader = PdfReader(input_pdf)
39
+ doc = fitz.open(stream=input_pdf, filetype="pdf")
43
40
  paths = []
44
41
 
45
- for i in range(len(reader.pages)):
46
- writer = PdfWriter()
47
- writer.add_page(reader.pages[i])
48
- writer.add_metadata(reader.metadata or {})
49
-
50
- output = Path(tmp_dir.name) / f"page_{i+1}.pdf"
51
- with open(output, "wb") as f:
52
- writer.write(f)
42
+ for page_num in range(len(doc)):
43
+ new_doc = fitz.open()
44
+ new_doc.insert_pdf(doc, from_page=page_num, to_page=page_num)
45
+ output = Path(tmp_dir.name) / f"page_{page_num + 1}.pdf"
46
+ new_doc.save(str(output))
47
+ new_doc.close()
53
48
  paths.append(output)
54
49
 
55
- return paths, tmp_dir # Return dir object so caller can control cleanup
50
+ doc.close()
51
+ return paths, tmp_dir
@@ -7,6 +7,7 @@ See tests for examples: tests/main/test_string_search.py
7
7
  """
8
8
 
9
9
  import difflib
10
+ import re
10
11
  from typing import List, Tuple
11
12
 
12
13
  from nltk.corpus import stopwords
@@ -195,8 +196,10 @@ def get_context(
195
196
 
196
197
  Returns:
197
198
  str: A string containing b words before, the match, and a words after
198
- the best approximate match position of the query in the text. If no
199
- match is found, returns empty string.
199
+ the best approximate match position of the query in the text.
200
+ The text is extracted from the original `text`, preserving formatting,
201
+ whitespace, etc, so it does not disturb any downstream processing.
202
+ If no match is found, returns empty string.
200
203
  int: The start position of the match in the text.
201
204
  int: The end position of the match in the text.
202
205
 
@@ -204,6 +207,8 @@ def get_context(
204
207
  >>> get_context("apple", "The quick brown fox jumps over the apple.", 3, 2)
205
208
  # 'fox jumps over the apple.'
206
209
  """
210
+
211
+ # If no word limits specified, return full text
207
212
  if words_after is None and words_before is None:
208
213
  # return entire text since we're not asked to return a bounded context
209
214
  return text, 0, 0
@@ -212,23 +217,35 @@ def get_context(
212
217
  if fuzz.partial_ratio(query, text) < 40:
213
218
  return "", 0, 0
214
219
 
220
+ # Find best matching position of query in text
215
221
  sequence_matcher = difflib.SequenceMatcher(None, text, query)
216
222
  match = sequence_matcher.find_longest_match(0, len(text), 0, len(query))
217
223
 
218
224
  if match.size == 0:
219
225
  return "", 0, 0
220
226
 
227
+ # Count words before match point
221
228
  segments = text.split()
222
229
  n_segs = len(segments)
223
-
224
230
  start_segment_pos = len(text[: match.a].split())
225
231
 
232
+ # Calculate word window boundaries
226
233
  words_before = words_before or n_segs
227
234
  words_after = words_after or n_segs
228
235
  start_pos = max(0, start_segment_pos - words_before)
229
236
  end_pos = min(len(segments), start_segment_pos + words_after + len(query.split()))
230
237
 
231
- return " ".join(segments[start_pos:end_pos]), start_pos, end_pos
238
+ # Find character positions where words start
239
+ word_positions = [m.start() for m in re.finditer(r"\S+", text)]
240
+
241
+ # Convert word positions to character positions
242
+ start_char = word_positions[start_pos] if start_pos < len(word_positions) else 0
243
+ end_char = word_positions[min(end_pos, len(word_positions) - 1)] + len(
244
+ text.split()[min(end_pos - 1, len(word_positions) - 1)]
245
+ )
246
+
247
+ # return exact substring with original formatting
248
+ return text[start_char:end_char], start_pos, end_pos
232
249
 
233
250
 
234
251
  def eliminate_near_duplicates(passages: List[str], threshold: float = 0.8) -> List[str]:
@@ -211,10 +211,10 @@ class WeaviateDB(VectorStore):
211
211
  return_properties=True,
212
212
  return_metadata=MetadataQuery(distance=True),
213
213
  )
214
- return [
215
- (self.weaviate_obj_to_doc(item), 1 - (item.metadata.distance or 1))
216
- for item in response.objects
217
- ]
214
+ maybe_distances = [item.metadata.distance for item in response.objects]
215
+ similarities = [0 if d is None else 1 - d for d in maybe_distances]
216
+ docs = [self.weaviate_obj_to_doc(item) for item in response.objects]
217
+ return list(zip(docs, similarities))
218
218
 
219
219
  def _create_valid_uuid_id(self, id: str) -> Any:
220
220
  try:
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "langroid"
3
- version = "0.37.1"
3
+ version = "0.37.3"
4
4
  authors = [
5
5
  {name = "Prasad Chalasani", email = "pchalasani@gmail.com"},
6
6
  ]
@@ -128,7 +128,6 @@ lancedb = [
128
128
 
129
129
  docling = [
130
130
  "docling<3.0.0,>=2.16.0",
131
- "pypdf>=5.1.0", # needed to split pdf into pages, then use docling
132
131
  ]
133
132
 
134
133
  pymupdf4llm = [
File without changes
File without changes
File without changes
File without changes
File without changes