langroid 0.37.0.tar.gz → 0.37.2.tar.gz

This diff shows the changes between publicly released versions of the package, as they appear in their public registry. It is provided for informational purposes only.
Files changed (130)
  1. {langroid-0.37.0 → langroid-0.37.2}/PKG-INFO +1 -1
  2. {langroid-0.37.0 → langroid-0.37.2}/langroid/agent/special/doc_chat_agent.py +5 -10
  3. {langroid-0.37.0 → langroid-0.37.2}/langroid/embedding_models/models.py +2 -2
  4. {langroid-0.37.0 → langroid-0.37.2}/langroid/parsing/document_parser.py +61 -23
  5. {langroid-0.37.0 → langroid-0.37.2}/langroid/parsing/parser.py +1 -1
  6. langroid-0.37.2/langroid/parsing/pdf_utils.py +51 -0
  7. {langroid-0.37.0 → langroid-0.37.2}/langroid/parsing/search.py +21 -4
  8. {langroid-0.37.0 → langroid-0.37.2}/pyproject.toml +1 -1
  9. {langroid-0.37.0 → langroid-0.37.2}/.gitignore +0 -0
  10. {langroid-0.37.0 → langroid-0.37.2}/LICENSE +0 -0
  11. {langroid-0.37.0 → langroid-0.37.2}/README.md +0 -0
  12. {langroid-0.37.0 → langroid-0.37.2}/langroid/__init__.py +0 -0
  13. {langroid-0.37.0 → langroid-0.37.2}/langroid/agent/__init__.py +0 -0
  14. {langroid-0.37.0 → langroid-0.37.2}/langroid/agent/base.py +0 -0
  15. {langroid-0.37.0 → langroid-0.37.2}/langroid/agent/batch.py +0 -0
  16. {langroid-0.37.0 → langroid-0.37.2}/langroid/agent/callbacks/__init__.py +0 -0
  17. {langroid-0.37.0 → langroid-0.37.2}/langroid/agent/callbacks/chainlit.py +0 -0
  18. {langroid-0.37.0 → langroid-0.37.2}/langroid/agent/chat_agent.py +0 -0
  19. {langroid-0.37.0 → langroid-0.37.2}/langroid/agent/chat_document.py +0 -0
  20. {langroid-0.37.0 → langroid-0.37.2}/langroid/agent/openai_assistant.py +0 -0
  21. {langroid-0.37.0 → langroid-0.37.2}/langroid/agent/special/__init__.py +0 -0
  22. {langroid-0.37.0 → langroid-0.37.2}/langroid/agent/special/arangodb/__init__.py +0 -0
  23. {langroid-0.37.0 → langroid-0.37.2}/langroid/agent/special/arangodb/arangodb_agent.py +0 -0
  24. {langroid-0.37.0 → langroid-0.37.2}/langroid/agent/special/arangodb/system_messages.py +0 -0
  25. {langroid-0.37.0 → langroid-0.37.2}/langroid/agent/special/arangodb/tools.py +0 -0
  26. {langroid-0.37.0 → langroid-0.37.2}/langroid/agent/special/arangodb/utils.py +0 -0
  27. {langroid-0.37.0 → langroid-0.37.2}/langroid/agent/special/lance_doc_chat_agent.py +0 -0
  28. {langroid-0.37.0 → langroid-0.37.2}/langroid/agent/special/lance_rag/__init__.py +0 -0
  29. {langroid-0.37.0 → langroid-0.37.2}/langroid/agent/special/lance_rag/critic_agent.py +0 -0
  30. {langroid-0.37.0 → langroid-0.37.2}/langroid/agent/special/lance_rag/lance_rag_task.py +0 -0
  31. {langroid-0.37.0 → langroid-0.37.2}/langroid/agent/special/lance_rag/query_planner_agent.py +0 -0
  32. {langroid-0.37.0 → langroid-0.37.2}/langroid/agent/special/lance_tools.py +0 -0
  33. {langroid-0.37.0 → langroid-0.37.2}/langroid/agent/special/neo4j/__init__.py +0 -0
  34. {langroid-0.37.0 → langroid-0.37.2}/langroid/agent/special/neo4j/csv_kg_chat.py +0 -0
  35. {langroid-0.37.0 → langroid-0.37.2}/langroid/agent/special/neo4j/neo4j_chat_agent.py +0 -0
  36. {langroid-0.37.0 → langroid-0.37.2}/langroid/agent/special/neo4j/system_messages.py +0 -0
  37. {langroid-0.37.0 → langroid-0.37.2}/langroid/agent/special/neo4j/tools.py +0 -0
  38. {langroid-0.37.0 → langroid-0.37.2}/langroid/agent/special/relevance_extractor_agent.py +0 -0
  39. {langroid-0.37.0 → langroid-0.37.2}/langroid/agent/special/retriever_agent.py +0 -0
  40. {langroid-0.37.0 → langroid-0.37.2}/langroid/agent/special/sql/__init__.py +0 -0
  41. {langroid-0.37.0 → langroid-0.37.2}/langroid/agent/special/sql/sql_chat_agent.py +0 -0
  42. {langroid-0.37.0 → langroid-0.37.2}/langroid/agent/special/sql/utils/__init__.py +0 -0
  43. {langroid-0.37.0 → langroid-0.37.2}/langroid/agent/special/sql/utils/description_extractors.py +0 -0
  44. {langroid-0.37.0 → langroid-0.37.2}/langroid/agent/special/sql/utils/populate_metadata.py +0 -0
  45. {langroid-0.37.0 → langroid-0.37.2}/langroid/agent/special/sql/utils/system_message.py +0 -0
  46. {langroid-0.37.0 → langroid-0.37.2}/langroid/agent/special/sql/utils/tools.py +0 -0
  47. {langroid-0.37.0 → langroid-0.37.2}/langroid/agent/special/table_chat_agent.py +0 -0
  48. {langroid-0.37.0 → langroid-0.37.2}/langroid/agent/task.py +0 -0
  49. {langroid-0.37.0 → langroid-0.37.2}/langroid/agent/tool_message.py +0 -0
  50. {langroid-0.37.0 → langroid-0.37.2}/langroid/agent/tools/__init__.py +0 -0
  51. {langroid-0.37.0 → langroid-0.37.2}/langroid/agent/tools/duckduckgo_search_tool.py +0 -0
  52. {langroid-0.37.0 → langroid-0.37.2}/langroid/agent/tools/file_tools.py +0 -0
  53. {langroid-0.37.0 → langroid-0.37.2}/langroid/agent/tools/google_search_tool.py +0 -0
  54. {langroid-0.37.0 → langroid-0.37.2}/langroid/agent/tools/metaphor_search_tool.py +0 -0
  55. {langroid-0.37.0 → langroid-0.37.2}/langroid/agent/tools/orchestration.py +0 -0
  56. {langroid-0.37.0 → langroid-0.37.2}/langroid/agent/tools/recipient_tool.py +0 -0
  57. {langroid-0.37.0 → langroid-0.37.2}/langroid/agent/tools/retrieval_tool.py +0 -0
  58. {langroid-0.37.0 → langroid-0.37.2}/langroid/agent/tools/rewind_tool.py +0 -0
  59. {langroid-0.37.0 → langroid-0.37.2}/langroid/agent/tools/segment_extract_tool.py +0 -0
  60. {langroid-0.37.0 → langroid-0.37.2}/langroid/agent/xml_tool_message.py +0 -0
  61. {langroid-0.37.0 → langroid-0.37.2}/langroid/cachedb/__init__.py +0 -0
  62. {langroid-0.37.0 → langroid-0.37.2}/langroid/cachedb/base.py +0 -0
  63. {langroid-0.37.0 → langroid-0.37.2}/langroid/cachedb/momento_cachedb.py +0 -0
  64. {langroid-0.37.0 → langroid-0.37.2}/langroid/cachedb/redis_cachedb.py +0 -0
  65. {langroid-0.37.0 → langroid-0.37.2}/langroid/embedding_models/__init__.py +0 -0
  66. {langroid-0.37.0 → langroid-0.37.2}/langroid/embedding_models/base.py +0 -0
  67. {langroid-0.37.0 → langroid-0.37.2}/langroid/embedding_models/protoc/__init__.py +0 -0
  68. {langroid-0.37.0 → langroid-0.37.2}/langroid/embedding_models/protoc/embeddings.proto +0 -0
  69. {langroid-0.37.0 → langroid-0.37.2}/langroid/embedding_models/protoc/embeddings_pb2.py +0 -0
  70. {langroid-0.37.0 → langroid-0.37.2}/langroid/embedding_models/protoc/embeddings_pb2.pyi +0 -0
  71. {langroid-0.37.0 → langroid-0.37.2}/langroid/embedding_models/protoc/embeddings_pb2_grpc.py +0 -0
  72. {langroid-0.37.0 → langroid-0.37.2}/langroid/embedding_models/remote_embeds.py +0 -0
  73. {langroid-0.37.0 → langroid-0.37.2}/langroid/exceptions.py +0 -0
  74. {langroid-0.37.0 → langroid-0.37.2}/langroid/language_models/__init__.py +0 -0
  75. {langroid-0.37.0 → langroid-0.37.2}/langroid/language_models/azure_openai.py +0 -0
  76. {langroid-0.37.0 → langroid-0.37.2}/langroid/language_models/base.py +0 -0
  77. {langroid-0.37.0 → langroid-0.37.2}/langroid/language_models/config.py +0 -0
  78. {langroid-0.37.0 → langroid-0.37.2}/langroid/language_models/mock_lm.py +0 -0
  79. {langroid-0.37.0 → langroid-0.37.2}/langroid/language_models/openai_gpt.py +0 -0
  80. {langroid-0.37.0 → langroid-0.37.2}/langroid/language_models/prompt_formatter/__init__.py +0 -0
  81. {langroid-0.37.0 → langroid-0.37.2}/langroid/language_models/prompt_formatter/base.py +0 -0
  82. {langroid-0.37.0 → langroid-0.37.2}/langroid/language_models/prompt_formatter/hf_formatter.py +0 -0
  83. {langroid-0.37.0 → langroid-0.37.2}/langroid/language_models/prompt_formatter/llama2_formatter.py +0 -0
  84. {langroid-0.37.0 → langroid-0.37.2}/langroid/language_models/utils.py +0 -0
  85. {langroid-0.37.0 → langroid-0.37.2}/langroid/mytypes.py +0 -0
  86. {langroid-0.37.0 → langroid-0.37.2}/langroid/parsing/__init__.py +0 -0
  87. {langroid-0.37.0 → langroid-0.37.2}/langroid/parsing/agent_chats.py +0 -0
  88. {langroid-0.37.0 → langroid-0.37.2}/langroid/parsing/code_parser.py +0 -0
  89. {langroid-0.37.0 → langroid-0.37.2}/langroid/parsing/para_sentence_split.py +0 -0
  90. {langroid-0.37.0 → langroid-0.37.2}/langroid/parsing/parse_json.py +0 -0
  91. {langroid-0.37.0 → langroid-0.37.2}/langroid/parsing/repo_loader.py +0 -0
  92. {langroid-0.37.0 → langroid-0.37.2}/langroid/parsing/routing.py +0 -0
  93. {langroid-0.37.0 → langroid-0.37.2}/langroid/parsing/spider.py +0 -0
  94. {langroid-0.37.0 → langroid-0.37.2}/langroid/parsing/table_loader.py +0 -0
  95. {langroid-0.37.0 → langroid-0.37.2}/langroid/parsing/url_loader.py +0 -0
  96. {langroid-0.37.0 → langroid-0.37.2}/langroid/parsing/urls.py +0 -0
  97. {langroid-0.37.0 → langroid-0.37.2}/langroid/parsing/utils.py +0 -0
  98. {langroid-0.37.0 → langroid-0.37.2}/langroid/parsing/web_search.py +0 -0
  99. {langroid-0.37.0 → langroid-0.37.2}/langroid/prompts/__init__.py +0 -0
  100. {langroid-0.37.0 → langroid-0.37.2}/langroid/prompts/dialog.py +0 -0
  101. {langroid-0.37.0 → langroid-0.37.2}/langroid/prompts/prompts_config.py +0 -0
  102. {langroid-0.37.0 → langroid-0.37.2}/langroid/prompts/templates.py +0 -0
  103. {langroid-0.37.0 → langroid-0.37.2}/langroid/py.typed +0 -0
  104. {langroid-0.37.0 → langroid-0.37.2}/langroid/pydantic_v1/__init__.py +0 -0
  105. {langroid-0.37.0 → langroid-0.37.2}/langroid/pydantic_v1/main.py +0 -0
  106. {langroid-0.37.0 → langroid-0.37.2}/langroid/utils/__init__.py +0 -0
  107. {langroid-0.37.0 → langroid-0.37.2}/langroid/utils/algorithms/__init__.py +0 -0
  108. {langroid-0.37.0 → langroid-0.37.2}/langroid/utils/algorithms/graph.py +0 -0
  109. {langroid-0.37.0 → langroid-0.37.2}/langroid/utils/configuration.py +0 -0
  110. {langroid-0.37.0 → langroid-0.37.2}/langroid/utils/constants.py +0 -0
  111. {langroid-0.37.0 → langroid-0.37.2}/langroid/utils/git_utils.py +0 -0
  112. {langroid-0.37.0 → langroid-0.37.2}/langroid/utils/globals.py +0 -0
  113. {langroid-0.37.0 → langroid-0.37.2}/langroid/utils/logging.py +0 -0
  114. {langroid-0.37.0 → langroid-0.37.2}/langroid/utils/object_registry.py +0 -0
  115. {langroid-0.37.0 → langroid-0.37.2}/langroid/utils/output/__init__.py +0 -0
  116. {langroid-0.37.0 → langroid-0.37.2}/langroid/utils/output/citations.py +0 -0
  117. {langroid-0.37.0 → langroid-0.37.2}/langroid/utils/output/printing.py +0 -0
  118. {langroid-0.37.0 → langroid-0.37.2}/langroid/utils/output/status.py +0 -0
  119. {langroid-0.37.0 → langroid-0.37.2}/langroid/utils/pandas_utils.py +0 -0
  120. {langroid-0.37.0 → langroid-0.37.2}/langroid/utils/pydantic_utils.py +0 -0
  121. {langroid-0.37.0 → langroid-0.37.2}/langroid/utils/system.py +0 -0
  122. {langroid-0.37.0 → langroid-0.37.2}/langroid/utils/types.py +0 -0
  123. {langroid-0.37.0 → langroid-0.37.2}/langroid/vector_store/__init__.py +0 -0
  124. {langroid-0.37.0 → langroid-0.37.2}/langroid/vector_store/base.py +0 -0
  125. {langroid-0.37.0 → langroid-0.37.2}/langroid/vector_store/chromadb.py +0 -0
  126. {langroid-0.37.0 → langroid-0.37.2}/langroid/vector_store/lancedb.py +0 -0
  127. {langroid-0.37.0 → langroid-0.37.2}/langroid/vector_store/meilisearch.py +0 -0
  128. {langroid-0.37.0 → langroid-0.37.2}/langroid/vector_store/momento.py +0 -0
  129. {langroid-0.37.0 → langroid-0.37.2}/langroid/vector_store/qdrantdb.py +0 -0
  130. {langroid-0.37.0 → langroid-0.37.2}/langroid/vector_store/weaviatedb.py +0 -0

{langroid-0.37.0 → langroid-0.37.2}/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: langroid
-Version: 0.37.0
+Version: 0.37.2
 Summary: Harness LLMs with Multi-Agent Programming
 Author-email: Prasad Chalasani <pchalasani@gmail.com>
 License: MIT

{langroid-0.37.0 → langroid-0.37.2}/langroid/agent/special/doc_chat_agent.py
@@ -15,7 +15,6 @@ pip install "langroid[hf-embeddings]"
 """
 
 import logging
-import textwrap
 from collections import OrderedDict
 from functools import cache
 from typing import Any, Callable, Dict, List, Optional, Set, Tuple, no_type_check
@@ -82,7 +81,7 @@ You will be given various passages from these documents, and asked to answer questions
 about them, or summarize them into coherent answers.
 """
 
-CHUNK_ENRICHMENT_DELIMITER = "\n<##-##-##>"
+CHUNK_ENRICHMENT_DELIMITER = "\n<##-##-##>\n"
 
 has_sentence_transformers = False
 try:
@@ -805,9 +804,9 @@ class DocChatAgent(ChatAgent):
         Returns:
             str: string representation
         """
-        contents = [f"Extract: {d.content}" for d in docs]
+        contents = [d.content for d in docs]
         sources = [d.metadata.source for d in docs]
-        sources = [f"Source: {s}" if s is not None else "" for s in sources]
+        sources = [f"SOURCE: {s}" if s is not None else "" for s in sources]
         return "\n".join(
             [
                 f"""
@@ -952,12 +951,8 @@ class DocChatAgent(ChatAgent):
                    continue

                # Combine original content with questions in a structured way
-                combined_content = textwrap.dedent(
-                    f"""\
-                    {doc.content}
-                    {enrichment_config.delimiter}
-                    {enrichment}
-                    """
+                combined_content = (
+                    f"{doc.content}{enrichment_config.delimiter}{enrichment}"
                )

                new_doc = doc.copy(
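
The two doc_chat_agent.py changes above work together: the enrichment (for example, generated questions about a chunk) is now appended to the chunk with a plain f-string, and the delimiter is newline-terminated on both sides, instead of going through a dedented multi-line template that could introduce stray indentation. A minimal standalone sketch of the round trip, with a hypothetical strip_enrichment helper that is not part of langroid's API:

    CHUNK_ENRICHMENT_DELIMITER = "\n<##-##-##>\n"

    def enrich(chunk: str, questions: str) -> str:
        # join chunk and generated questions, as the new combined_content does
        return f"{chunk}{CHUNK_ENRICHMENT_DELIMITER}{questions}"

    def strip_enrichment(combined: str) -> str:
        # hypothetical helper: recover the original chunk by splitting on the delimiter
        return combined.split(CHUNK_ENRICHMENT_DELIMITER)[0]

    chunk = "Langroid is a multi-agent LLM framework."
    questions = "What is Langroid?\nWhat does it harness?"
    assert strip_enrichment(enrich(chunk, questions)) == chunk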

{langroid-0.37.0 → langroid-0.37.2}/langroid/embedding_models/models.py
@@ -18,7 +18,7 @@ AzureADTokenProvider = Callable[[], str]
 
 class OpenAIEmbeddingsConfig(EmbeddingModelsConfig):
     model_type: str = "openai"
-    model_name: str = "text-embedding-3-large"
+    model_name: str = "text-embedding-3-small"
     api_key: str = ""
     api_base: Optional[str] = None
     organization: str = ""
@@ -28,7 +28,7 @@ class OpenAIEmbeddingsConfig(EmbeddingModelsConfig):
 
 class AzureOpenAIEmbeddingsConfig(EmbeddingModelsConfig):
     model_type: str = "azure-openai"
-    model_name: str = "text-embedding-3-large"
+    model_name: str = "text-embedding-3-small"
     api_key: str = ""
     api_base: str = ""
     deployment_name: Optional[str] = None

{langroid-0.37.0 → langroid-0.37.2}/langroid/parsing/document_parser.py
@@ -2,10 +2,13 @@ from __future__ import annotations
 
 import itertools
 import logging
+import os
 import re
 import tempfile
 from enum import Enum
 from io import BytesIO
+from itertools import accumulate
+from pathlib import Path
 from typing import TYPE_CHECKING, Any, Dict, Generator, List, Tuple
 
 from langroid.exceptions import LangroidImportError
@@ -507,6 +510,8 @@ class DoclingParser(DocumentParser):
     def iterate_pages(self) -> Generator[Tuple[int, Any], None, None]:
         """
         Yield each page in the PDF using `docling`.
+        Code largely from this example:
+        https://github.com/DS4SD/docling/blob/4d41db3f7abb86c8c65386bf94e7eb0bf22bb82b/docs/examples/export_figures.py
 
         Returns:
             Generator[docling.Page]: Generator yielding each page.
@@ -515,42 +520,75 @@
             raise LangroidImportError(
                 "docling", ["docling", "pdf-parsers", "all", "doc-chat"]
             )
-        from docling.datamodel.document import TextItem  # type: ignore
+
+        from docling.datamodel.base_models import InputFormat  # type: ignore
+        from docling.datamodel.pipeline_options import PdfPipelineOptions
         from docling.document_converter import (  # type: ignore
             ConversionResult,
             DocumentConverter,
+            PdfFormatOption,
         )
+        from docling_core.types.doc import ImageRefMode  # type: ignore
+
+        IMAGE_RESOLUTION_SCALE = 2.0
+        pipeline_options = PdfPipelineOptions()
+        pipeline_options.images_scale = IMAGE_RESOLUTION_SCALE
+        pipeline_options.generate_page_images = True
+        pipeline_options.generate_picture_images = True
+
+        converter = DocumentConverter(
+            format_options={
+                InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)
+            }
+        )
+        doc_path = self.source
+        if doc_path == "bytes":
+            # write to tmp file, then use that path
+            with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as temp_file:
+                temp_file.write(self.doc_bytes.getvalue())
+                doc_path = temp_file.name
+
+        output_dir = Path(str(Path(doc_path).with_suffix("")) + "-pages")
+        os.makedirs(output_dir, exist_ok=True)
+
+        result: ConversionResult = converter.convert(doc_path)
+
+        def n_page_elements(page) -> int:  # type: ignore
+            if page.assembled is None:
+                return 0
+            return 1 + len(page.assembled.elements)
+
+        page_element_count = [n_page_elements(i) for i in result.pages]
+        element_page_cutoff = list(accumulate([1] + page_element_count))
+        for i, page in enumerate(result.pages):
+            page_start = element_page_cutoff[i]
+            page_end = element_page_cutoff[i + 1]
+            md_file = output_dir / f"page_{i}.md"
+            # we could have just directly exported to a markdown string,
+            # but we need to save to a file to force generation of image-files.
+            result.document.save_as_markdown(
+                md_file,
+                image_mode=ImageRefMode.REFERENCED,
+                from_element=page_start,
+                to_element=page_end,
+            )
+            yield i, md_file
 
-        converter = DocumentConverter()
-        file_path = self.source
-        if file_path == "bytes":
-            with tempfile.NamedTemporaryFile(delete=False) as tmp:
-                tmp.write(self.doc_bytes.getvalue())
-                file_path = tmp.name
-        result: ConversionResult = converter.convert(file_path)
-        doc = result.document
-        n_pages = doc.num_pages()  # type: ignore
-        for i in range(n_pages):
-            texts = [
-                item[0].text
-                for item in doc.iterate_items(page_no=i + 1)
-                if isinstance(item[0], TextItem)
-            ]
-            text = "\n".join(texts)
-            yield i, text
-
-    def get_document_from_page(self, page: str) -> Document:
+    def get_document_from_page(self, md_file: str) -> Document:
         """
-        Get Document object from a given `docling` "page" (actually a chunk).
+        Get Document object from a given 1-page markdown file,
+        possibly containing image refs.
 
         Args:
-            page (docling.chunking.DocChunk): The `docling` chunk
+            md_file (str): The markdown file path for the page.
 
         Returns:
             Document: Document object, with content and possible metadata.
         """
+        with open(md_file, "r") as f:
+            text = f.read()
         return Document(
-            content=self.fix_text(page),
+            content=self.fix_text(text),
             metadata=DocMetaData(source=self.source),
         )
 
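The page-wise markdown export above relies on cumulative element counts: accumulate([1] + page_element_count) yields the 1-based element index at which each page starts, and consecutive cutoffs become the from_element/to_element bounds passed to save_as_markdown. A small sketch of just that arithmetic, with made-up page sizes:

    from itertools import accumulate

    # hypothetical per-page counts: 1 page-level element + its assembled elements
    page_element_count = [4, 7, 3]

    # running totals, offset by 1, mark where each page's elements begin
    element_page_cutoff = list(accumulate([1] + page_element_count))
    print(element_page_cutoff)  # [1, 5, 12, 15]

    for i in range(len(page_element_count)):
        page_start = element_page_cutoff[i]
        page_end = element_page_cutoff[i + 1]
        print(f"page {i}: elements [{page_start}, {page_end})")
    # page 0: elements [1, 5)
    # page 1: elements [5, 12)
    # page 2: elements [12, 15)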

{langroid-0.37.0 → langroid-0.37.2}/langroid/parsing/parser.py
@@ -51,7 +51,7 @@ class ParsingConfig(BaseSettings):
     n_similar_docs: int = 4
     n_neighbor_ids: int = 5  # window size to store around each chunk
     separators: List[str] = ["\n\n", "\n", " ", ""]
-    token_encoding_model: str = "text-embedding-3-large"
+    token_encoding_model: str = "text-embedding-3-small"
     pdf: PdfParsingConfig = PdfParsingConfig()
     docx: DocxParsingConfig = DocxParsingConfig()
     doc: DocParsingConfig = DocParsingConfig()

langroid-0.37.2/langroid/parsing/pdf_utils.py (new file)
@@ -0,0 +1,51 @@
+import tempfile
+from io import BytesIO
+from pathlib import Path
+from tempfile import TemporaryDirectory
+from typing import TYPE_CHECKING, Any, BinaryIO, List, Tuple, Union
+
+try:
+    import fitz
+except ImportError:
+    if not TYPE_CHECKING:
+        fitz = None
+
+from langroid.exceptions import LangroidImportError
+
+if fitz is None:
+    raise LangroidImportError("fitz", ["pymupdf", "all", "pdf-parsers", "doc-chat"])
+
+
+def pdf_split_pages(
+    input_pdf: Union[BytesIO, BinaryIO],
+) -> Tuple[List[Path], TemporaryDirectory[Any]]:
+    """Splits a PDF into individual pages in a temporary directory.
+
+    Args:
+        input_pdf: Input PDF file in bytes or binary mode
+
+    Returns:
+        Tuple containing:
+        - List of paths to individual PDF pages
+        - Temporary directory object (caller must call cleanup())
+
+    Example:
+        paths, tmp_dir = pdf_split_pages(input_pdf)
+        # Use paths...
+        tmp_dir.cleanup()  # Clean up temp files when done
+    """
+    tmp_dir = tempfile.TemporaryDirectory()
+    doc = fitz.open(stream=input_pdf, filetype="pdf")
+    paths = []
+
+    for page_num in range(len(doc)):
+        new_doc = fitz.open()
+        new_doc.insert_pdf(doc, from_page=page_num, to_page=page_num)
+        output = Path(tmp_dir.name) / f"page_{page_num + 1}.pdf"
+        new_doc.save(str(output))
+        new_doc.close()
+        paths.append(output)
+
+    doc.close()
+    return paths, tmp_dir
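
A usage sketch for the new helper, following the pattern in its own docstring; the input file name is hypothetical, and the caller owns the temporary directory:

    from langroid.parsing.pdf_utils import pdf_split_pages

    # hypothetical input file, opened in binary mode
    with open("report.pdf", "rb") as f:
        pages, tmp_dir = pdf_split_pages(f)

    # each entry is a single-page PDF written into the temp directory
    for p in pages:
        print(p.name)  # page_1.pdf, page_2.pdf, ...

    tmp_dir.cleanup()  # caller must remove the temp files when done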

{langroid-0.37.0 → langroid-0.37.2}/langroid/parsing/search.py
@@ -7,6 +7,7 @@ See tests for examples: tests/main/test_string_search.py
 """
 
 import difflib
+import re
 from typing import List, Tuple
 
 from nltk.corpus import stopwords
@@ -195,8 +196,10 @@ def get_context(
 
     Returns:
         str: A string containing b words before, the match, and a words after
-        the best approximate match position of the query in the text. If no
-        match is found, returns empty string.
+        the best approximate match position of the query in the text.
+        The text is extracted from the original `text`, preserving formatting,
+        whitespace, etc, so it does not disturb any downstream processing.
+        If no match is found, returns empty string.
         int: The start position of the match in the text.
         int: The end position of the match in the text.
 
@@ -204,6 +207,8 @@ def get_context(
     >>> get_context("apple", "The quick brown fox jumps over the apple.", 3, 2)
     # 'fox jumps over the apple.'
     """
+
+    # If no word limits specified, return full text
     if words_after is None and words_before is None:
         # return entire text since we're not asked to return a bounded context
         return text, 0, 0
@@ -212,23 +217,35 @@ def get_context(
     if fuzz.partial_ratio(query, text) < 40:
         return "", 0, 0
 
+    # Find best matching position of query in text
     sequence_matcher = difflib.SequenceMatcher(None, text, query)
     match = sequence_matcher.find_longest_match(0, len(text), 0, len(query))
 
     if match.size == 0:
         return "", 0, 0
 
+    # Count words before match point
     segments = text.split()
     n_segs = len(segments)
-
     start_segment_pos = len(text[: match.a].split())
 
+    # Calculate word window boundaries
     words_before = words_before or n_segs
     words_after = words_after or n_segs
    start_pos = max(0, start_segment_pos - words_before)
    end_pos = min(len(segments), start_segment_pos + words_after + len(query.split()))
 
-    return " ".join(segments[start_pos:end_pos]), start_pos, end_pos
+    # Find character positions where words start
+    word_positions = [m.start() for m in re.finditer(r"\S+", text)]
+
+    # Convert word positions to character positions
+    start_char = word_positions[start_pos] if start_pos < len(word_positions) else 0
+    end_char = word_positions[min(end_pos, len(word_positions) - 1)] + len(
+        text.split()[min(end_pos - 1, len(word_positions) - 1)]
+    )
+
+    # return exact substring with original formatting
+    return text[start_char:end_char], start_pos, end_pos
 
 
 def eliminate_near_duplicates(passages: List[str], threshold: float = 0.8) -> List[str]:
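
The practical effect of the get_context change: the old version rebuilt the window with " ".join(...), collapsing newlines and repeated spaces, while the new version slices the original string, so the returned context keeps the source formatting. An illustrative, self-contained comparison (not calling langroid):

    import re

    text = "Heading:\n  The quick  brown fox\njumps over the apple."
    segments = text.split()
    start_pos, end_pos = 2, 7  # an illustrative word window around a match

    # old behavior: words re-joined with single spaces, formatting lost
    old = " ".join(segments[start_pos:end_pos])

    # new behavior: exact character span covering the same words, formatting kept
    word_positions = [m.start() for m in re.finditer(r"\S+", text)]
    start_char = word_positions[start_pos]
    end_char = word_positions[end_pos - 1] + len(segments[end_pos - 1])
    new = text[start_char:end_char]

    print(repr(old))  # 'quick brown fox jumps over'
    print(repr(new))  # 'quick  brown fox\njumps over'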

{langroid-0.37.0 → langroid-0.37.2}/pyproject.toml
@@ -1,6 +1,6 @@
 [project]
 name = "langroid"
-version = "0.37.0"
+version = "0.37.2"
 authors = [
     {name = "Prasad Chalasani", email = "pchalasani@gmail.com"},
 ]