langroid 0.37.0__tar.gz → 0.37.2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {langroid-0.37.0 → langroid-0.37.2}/PKG-INFO +1 -1
- {langroid-0.37.0 → langroid-0.37.2}/langroid/agent/special/doc_chat_agent.py +5 -10
- {langroid-0.37.0 → langroid-0.37.2}/langroid/embedding_models/models.py +2 -2
- {langroid-0.37.0 → langroid-0.37.2}/langroid/parsing/document_parser.py +61 -23
- {langroid-0.37.0 → langroid-0.37.2}/langroid/parsing/parser.py +1 -1
- langroid-0.37.2/langroid/parsing/pdf_utils.py +51 -0
- {langroid-0.37.0 → langroid-0.37.2}/langroid/parsing/search.py +21 -4
- {langroid-0.37.0 → langroid-0.37.2}/pyproject.toml +1 -1
- {langroid-0.37.0 → langroid-0.37.2}/.gitignore +0 -0
- {langroid-0.37.0 → langroid-0.37.2}/LICENSE +0 -0
- {langroid-0.37.0 → langroid-0.37.2}/README.md +0 -0
- {langroid-0.37.0 → langroid-0.37.2}/langroid/__init__.py +0 -0
- {langroid-0.37.0 → langroid-0.37.2}/langroid/agent/__init__.py +0 -0
- {langroid-0.37.0 → langroid-0.37.2}/langroid/agent/base.py +0 -0
- {langroid-0.37.0 → langroid-0.37.2}/langroid/agent/batch.py +0 -0
- {langroid-0.37.0 → langroid-0.37.2}/langroid/agent/callbacks/__init__.py +0 -0
- {langroid-0.37.0 → langroid-0.37.2}/langroid/agent/callbacks/chainlit.py +0 -0
- {langroid-0.37.0 → langroid-0.37.2}/langroid/agent/chat_agent.py +0 -0
- {langroid-0.37.0 → langroid-0.37.2}/langroid/agent/chat_document.py +0 -0
- {langroid-0.37.0 → langroid-0.37.2}/langroid/agent/openai_assistant.py +0 -0
- {langroid-0.37.0 → langroid-0.37.2}/langroid/agent/special/__init__.py +0 -0
- {langroid-0.37.0 → langroid-0.37.2}/langroid/agent/special/arangodb/__init__.py +0 -0
- {langroid-0.37.0 → langroid-0.37.2}/langroid/agent/special/arangodb/arangodb_agent.py +0 -0
- {langroid-0.37.0 → langroid-0.37.2}/langroid/agent/special/arangodb/system_messages.py +0 -0
- {langroid-0.37.0 → langroid-0.37.2}/langroid/agent/special/arangodb/tools.py +0 -0
- {langroid-0.37.0 → langroid-0.37.2}/langroid/agent/special/arangodb/utils.py +0 -0
- {langroid-0.37.0 → langroid-0.37.2}/langroid/agent/special/lance_doc_chat_agent.py +0 -0
- {langroid-0.37.0 → langroid-0.37.2}/langroid/agent/special/lance_rag/__init__.py +0 -0
- {langroid-0.37.0 → langroid-0.37.2}/langroid/agent/special/lance_rag/critic_agent.py +0 -0
- {langroid-0.37.0 → langroid-0.37.2}/langroid/agent/special/lance_rag/lance_rag_task.py +0 -0
- {langroid-0.37.0 → langroid-0.37.2}/langroid/agent/special/lance_rag/query_planner_agent.py +0 -0
- {langroid-0.37.0 → langroid-0.37.2}/langroid/agent/special/lance_tools.py +0 -0
- {langroid-0.37.0 → langroid-0.37.2}/langroid/agent/special/neo4j/__init__.py +0 -0
- {langroid-0.37.0 → langroid-0.37.2}/langroid/agent/special/neo4j/csv_kg_chat.py +0 -0
- {langroid-0.37.0 → langroid-0.37.2}/langroid/agent/special/neo4j/neo4j_chat_agent.py +0 -0
- {langroid-0.37.0 → langroid-0.37.2}/langroid/agent/special/neo4j/system_messages.py +0 -0
- {langroid-0.37.0 → langroid-0.37.2}/langroid/agent/special/neo4j/tools.py +0 -0
- {langroid-0.37.0 → langroid-0.37.2}/langroid/agent/special/relevance_extractor_agent.py +0 -0
- {langroid-0.37.0 → langroid-0.37.2}/langroid/agent/special/retriever_agent.py +0 -0
- {langroid-0.37.0 → langroid-0.37.2}/langroid/agent/special/sql/__init__.py +0 -0
- {langroid-0.37.0 → langroid-0.37.2}/langroid/agent/special/sql/sql_chat_agent.py +0 -0
- {langroid-0.37.0 → langroid-0.37.2}/langroid/agent/special/sql/utils/__init__.py +0 -0
- {langroid-0.37.0 → langroid-0.37.2}/langroid/agent/special/sql/utils/description_extractors.py +0 -0
- {langroid-0.37.0 → langroid-0.37.2}/langroid/agent/special/sql/utils/populate_metadata.py +0 -0
- {langroid-0.37.0 → langroid-0.37.2}/langroid/agent/special/sql/utils/system_message.py +0 -0
- {langroid-0.37.0 → langroid-0.37.2}/langroid/agent/special/sql/utils/tools.py +0 -0
- {langroid-0.37.0 → langroid-0.37.2}/langroid/agent/special/table_chat_agent.py +0 -0
- {langroid-0.37.0 → langroid-0.37.2}/langroid/agent/task.py +0 -0
- {langroid-0.37.0 → langroid-0.37.2}/langroid/agent/tool_message.py +0 -0
- {langroid-0.37.0 → langroid-0.37.2}/langroid/agent/tools/__init__.py +0 -0
- {langroid-0.37.0 → langroid-0.37.2}/langroid/agent/tools/duckduckgo_search_tool.py +0 -0
- {langroid-0.37.0 → langroid-0.37.2}/langroid/agent/tools/file_tools.py +0 -0
- {langroid-0.37.0 → langroid-0.37.2}/langroid/agent/tools/google_search_tool.py +0 -0
- {langroid-0.37.0 → langroid-0.37.2}/langroid/agent/tools/metaphor_search_tool.py +0 -0
- {langroid-0.37.0 → langroid-0.37.2}/langroid/agent/tools/orchestration.py +0 -0
- {langroid-0.37.0 → langroid-0.37.2}/langroid/agent/tools/recipient_tool.py +0 -0
- {langroid-0.37.0 → langroid-0.37.2}/langroid/agent/tools/retrieval_tool.py +0 -0
- {langroid-0.37.0 → langroid-0.37.2}/langroid/agent/tools/rewind_tool.py +0 -0
- {langroid-0.37.0 → langroid-0.37.2}/langroid/agent/tools/segment_extract_tool.py +0 -0
- {langroid-0.37.0 → langroid-0.37.2}/langroid/agent/xml_tool_message.py +0 -0
- {langroid-0.37.0 → langroid-0.37.2}/langroid/cachedb/__init__.py +0 -0
- {langroid-0.37.0 → langroid-0.37.2}/langroid/cachedb/base.py +0 -0
- {langroid-0.37.0 → langroid-0.37.2}/langroid/cachedb/momento_cachedb.py +0 -0
- {langroid-0.37.0 → langroid-0.37.2}/langroid/cachedb/redis_cachedb.py +0 -0
- {langroid-0.37.0 → langroid-0.37.2}/langroid/embedding_models/__init__.py +0 -0
- {langroid-0.37.0 → langroid-0.37.2}/langroid/embedding_models/base.py +0 -0
- {langroid-0.37.0 → langroid-0.37.2}/langroid/embedding_models/protoc/__init__.py +0 -0
- {langroid-0.37.0 → langroid-0.37.2}/langroid/embedding_models/protoc/embeddings.proto +0 -0
- {langroid-0.37.0 → langroid-0.37.2}/langroid/embedding_models/protoc/embeddings_pb2.py +0 -0
- {langroid-0.37.0 → langroid-0.37.2}/langroid/embedding_models/protoc/embeddings_pb2.pyi +0 -0
- {langroid-0.37.0 → langroid-0.37.2}/langroid/embedding_models/protoc/embeddings_pb2_grpc.py +0 -0
- {langroid-0.37.0 → langroid-0.37.2}/langroid/embedding_models/remote_embeds.py +0 -0
- {langroid-0.37.0 → langroid-0.37.2}/langroid/exceptions.py +0 -0
- {langroid-0.37.0 → langroid-0.37.2}/langroid/language_models/__init__.py +0 -0
- {langroid-0.37.0 → langroid-0.37.2}/langroid/language_models/azure_openai.py +0 -0
- {langroid-0.37.0 → langroid-0.37.2}/langroid/language_models/base.py +0 -0
- {langroid-0.37.0 → langroid-0.37.2}/langroid/language_models/config.py +0 -0
- {langroid-0.37.0 → langroid-0.37.2}/langroid/language_models/mock_lm.py +0 -0
- {langroid-0.37.0 → langroid-0.37.2}/langroid/language_models/openai_gpt.py +0 -0
- {langroid-0.37.0 → langroid-0.37.2}/langroid/language_models/prompt_formatter/__init__.py +0 -0
- {langroid-0.37.0 → langroid-0.37.2}/langroid/language_models/prompt_formatter/base.py +0 -0
- {langroid-0.37.0 → langroid-0.37.2}/langroid/language_models/prompt_formatter/hf_formatter.py +0 -0
- {langroid-0.37.0 → langroid-0.37.2}/langroid/language_models/prompt_formatter/llama2_formatter.py +0 -0
- {langroid-0.37.0 → langroid-0.37.2}/langroid/language_models/utils.py +0 -0
- {langroid-0.37.0 → langroid-0.37.2}/langroid/mytypes.py +0 -0
- {langroid-0.37.0 → langroid-0.37.2}/langroid/parsing/__init__.py +0 -0
- {langroid-0.37.0 → langroid-0.37.2}/langroid/parsing/agent_chats.py +0 -0
- {langroid-0.37.0 → langroid-0.37.2}/langroid/parsing/code_parser.py +0 -0
- {langroid-0.37.0 → langroid-0.37.2}/langroid/parsing/para_sentence_split.py +0 -0
- {langroid-0.37.0 → langroid-0.37.2}/langroid/parsing/parse_json.py +0 -0
- {langroid-0.37.0 → langroid-0.37.2}/langroid/parsing/repo_loader.py +0 -0
- {langroid-0.37.0 → langroid-0.37.2}/langroid/parsing/routing.py +0 -0
- {langroid-0.37.0 → langroid-0.37.2}/langroid/parsing/spider.py +0 -0
- {langroid-0.37.0 → langroid-0.37.2}/langroid/parsing/table_loader.py +0 -0
- {langroid-0.37.0 → langroid-0.37.2}/langroid/parsing/url_loader.py +0 -0
- {langroid-0.37.0 → langroid-0.37.2}/langroid/parsing/urls.py +0 -0
- {langroid-0.37.0 → langroid-0.37.2}/langroid/parsing/utils.py +0 -0
- {langroid-0.37.0 → langroid-0.37.2}/langroid/parsing/web_search.py +0 -0
- {langroid-0.37.0 → langroid-0.37.2}/langroid/prompts/__init__.py +0 -0
- {langroid-0.37.0 → langroid-0.37.2}/langroid/prompts/dialog.py +0 -0
- {langroid-0.37.0 → langroid-0.37.2}/langroid/prompts/prompts_config.py +0 -0
- {langroid-0.37.0 → langroid-0.37.2}/langroid/prompts/templates.py +0 -0
- {langroid-0.37.0 → langroid-0.37.2}/langroid/py.typed +0 -0
- {langroid-0.37.0 → langroid-0.37.2}/langroid/pydantic_v1/__init__.py +0 -0
- {langroid-0.37.0 → langroid-0.37.2}/langroid/pydantic_v1/main.py +0 -0
- {langroid-0.37.0 → langroid-0.37.2}/langroid/utils/__init__.py +0 -0
- {langroid-0.37.0 → langroid-0.37.2}/langroid/utils/algorithms/__init__.py +0 -0
- {langroid-0.37.0 → langroid-0.37.2}/langroid/utils/algorithms/graph.py +0 -0
- {langroid-0.37.0 → langroid-0.37.2}/langroid/utils/configuration.py +0 -0
- {langroid-0.37.0 → langroid-0.37.2}/langroid/utils/constants.py +0 -0
- {langroid-0.37.0 → langroid-0.37.2}/langroid/utils/git_utils.py +0 -0
- {langroid-0.37.0 → langroid-0.37.2}/langroid/utils/globals.py +0 -0
- {langroid-0.37.0 → langroid-0.37.2}/langroid/utils/logging.py +0 -0
- {langroid-0.37.0 → langroid-0.37.2}/langroid/utils/object_registry.py +0 -0
- {langroid-0.37.0 → langroid-0.37.2}/langroid/utils/output/__init__.py +0 -0
- {langroid-0.37.0 → langroid-0.37.2}/langroid/utils/output/citations.py +0 -0
- {langroid-0.37.0 → langroid-0.37.2}/langroid/utils/output/printing.py +0 -0
- {langroid-0.37.0 → langroid-0.37.2}/langroid/utils/output/status.py +0 -0
- {langroid-0.37.0 → langroid-0.37.2}/langroid/utils/pandas_utils.py +0 -0
- {langroid-0.37.0 → langroid-0.37.2}/langroid/utils/pydantic_utils.py +0 -0
- {langroid-0.37.0 → langroid-0.37.2}/langroid/utils/system.py +0 -0
- {langroid-0.37.0 → langroid-0.37.2}/langroid/utils/types.py +0 -0
- {langroid-0.37.0 → langroid-0.37.2}/langroid/vector_store/__init__.py +0 -0
- {langroid-0.37.0 → langroid-0.37.2}/langroid/vector_store/base.py +0 -0
- {langroid-0.37.0 → langroid-0.37.2}/langroid/vector_store/chromadb.py +0 -0
- {langroid-0.37.0 → langroid-0.37.2}/langroid/vector_store/lancedb.py +0 -0
- {langroid-0.37.0 → langroid-0.37.2}/langroid/vector_store/meilisearch.py +0 -0
- {langroid-0.37.0 → langroid-0.37.2}/langroid/vector_store/momento.py +0 -0
- {langroid-0.37.0 → langroid-0.37.2}/langroid/vector_store/qdrantdb.py +0 -0
- {langroid-0.37.0 → langroid-0.37.2}/langroid/vector_store/weaviatedb.py +0 -0
@@ -15,7 +15,6 @@ pip install "langroid[hf-embeddings]"
|
|
15
15
|
"""
|
16
16
|
|
17
17
|
import logging
|
18
|
-
import textwrap
|
19
18
|
from collections import OrderedDict
|
20
19
|
from functools import cache
|
21
20
|
from typing import Any, Callable, Dict, List, Optional, Set, Tuple, no_type_check
|
@@ -82,7 +81,7 @@ You will be given various passages from these documents, and asked to answer que
|
|
82
81
|
about them, or summarize them into coherent answers.
|
83
82
|
"""
|
84
83
|
|
85
|
-
CHUNK_ENRICHMENT_DELIMITER = "\n
|
84
|
+
CHUNK_ENRICHMENT_DELIMITER = "\n<##-##-##>\n"
|
86
85
|
|
87
86
|
has_sentence_transformers = False
|
88
87
|
try:
|
@@ -805,9 +804,9 @@ class DocChatAgent(ChatAgent):
|
|
805
804
|
Returns:
|
806
805
|
str: string representation
|
807
806
|
"""
|
808
|
-
contents = [
|
807
|
+
contents = [d.content for d in docs]
|
809
808
|
sources = [d.metadata.source for d in docs]
|
810
|
-
sources = [f"
|
809
|
+
sources = [f"SOURCE: {s}" if s is not None else "" for s in sources]
|
811
810
|
return "\n".join(
|
812
811
|
[
|
813
812
|
f"""
|
@@ -952,12 +951,8 @@ class DocChatAgent(ChatAgent):
|
|
952
951
|
continue
|
953
952
|
|
954
953
|
# Combine original content with questions in a structured way
|
955
|
-
combined_content =
|
956
|
-
f""
|
957
|
-
{doc.content}
|
958
|
-
{enrichment_config.delimiter}
|
959
|
-
{enrichment}
|
960
|
-
"""
|
954
|
+
combined_content = (
|
955
|
+
f"{doc.content}{enrichment_config.delimiter}{enrichment}"
|
961
956
|
)
|
962
957
|
|
963
958
|
new_doc = doc.copy(
|
@@ -18,7 +18,7 @@ AzureADTokenProvider = Callable[[], str]
|
|
18
18
|
|
19
19
|
class OpenAIEmbeddingsConfig(EmbeddingModelsConfig):
|
20
20
|
model_type: str = "openai"
|
21
|
-
model_name: str = "text-embedding-3-
|
21
|
+
model_name: str = "text-embedding-3-small"
|
22
22
|
api_key: str = ""
|
23
23
|
api_base: Optional[str] = None
|
24
24
|
organization: str = ""
|
@@ -28,7 +28,7 @@ class OpenAIEmbeddingsConfig(EmbeddingModelsConfig):
|
|
28
28
|
|
29
29
|
class AzureOpenAIEmbeddingsConfig(EmbeddingModelsConfig):
|
30
30
|
model_type: str = "azure-openai"
|
31
|
-
model_name: str = "text-embedding-3-
|
31
|
+
model_name: str = "text-embedding-3-small"
|
32
32
|
api_key: str = ""
|
33
33
|
api_base: str = ""
|
34
34
|
deployment_name: Optional[str] = None
|
@@ -2,10 +2,13 @@ from __future__ import annotations
|
|
2
2
|
|
3
3
|
import itertools
|
4
4
|
import logging
|
5
|
+
import os
|
5
6
|
import re
|
6
7
|
import tempfile
|
7
8
|
from enum import Enum
|
8
9
|
from io import BytesIO
|
10
|
+
from itertools import accumulate
|
11
|
+
from pathlib import Path
|
9
12
|
from typing import TYPE_CHECKING, Any, Dict, Generator, List, Tuple
|
10
13
|
|
11
14
|
from langroid.exceptions import LangroidImportError
|
@@ -507,6 +510,8 @@ class DoclingParser(DocumentParser):
|
|
507
510
|
def iterate_pages(self) -> Generator[Tuple[int, Any], None, None]:
|
508
511
|
"""
|
509
512
|
Yield each page in the PDF using `docling`.
|
513
|
+
Code largely from this example:
|
514
|
+
https://github.com/DS4SD/docling/blob/4d41db3f7abb86c8c65386bf94e7eb0bf22bb82b/docs/examples/export_figures.py
|
510
515
|
|
511
516
|
Returns:
|
512
517
|
Generator[docling.Page]: Generator yielding each page.
|
@@ -515,42 +520,75 @@ class DoclingParser(DocumentParser):
|
|
515
520
|
raise LangroidImportError(
|
516
521
|
"docling", ["docling", "pdf-parsers", "all", "doc-chat"]
|
517
522
|
)
|
518
|
-
|
523
|
+
|
524
|
+
from docling.datamodel.base_models import InputFormat # type: ignore
|
525
|
+
from docling.datamodel.pipeline_options import PdfPipelineOptions
|
519
526
|
from docling.document_converter import ( # type: ignore
|
520
527
|
ConversionResult,
|
521
528
|
DocumentConverter,
|
529
|
+
PdfFormatOption,
|
522
530
|
)
|
531
|
+
from docling_core.types.doc import ImageRefMode # type: ignore
|
532
|
+
|
533
|
+
IMAGE_RESOLUTION_SCALE = 2.0
|
534
|
+
pipeline_options = PdfPipelineOptions()
|
535
|
+
pipeline_options.images_scale = IMAGE_RESOLUTION_SCALE
|
536
|
+
pipeline_options.generate_page_images = True
|
537
|
+
pipeline_options.generate_picture_images = True
|
538
|
+
|
539
|
+
converter = DocumentConverter(
|
540
|
+
format_options={
|
541
|
+
InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)
|
542
|
+
}
|
543
|
+
)
|
544
|
+
doc_path = self.source
|
545
|
+
if doc_path == "bytes":
|
546
|
+
# write to tmp file, then use that path
|
547
|
+
with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as temp_file:
|
548
|
+
temp_file.write(self.doc_bytes.getvalue())
|
549
|
+
doc_path = temp_file.name
|
550
|
+
|
551
|
+
output_dir = Path(str(Path(doc_path).with_suffix("")) + "-pages")
|
552
|
+
os.makedirs(output_dir, exist_ok=True)
|
553
|
+
|
554
|
+
result: ConversionResult = converter.convert(doc_path)
|
555
|
+
|
556
|
+
def n_page_elements(page) -> int: # type: ignore
|
557
|
+
if page.assembled is None:
|
558
|
+
return 0
|
559
|
+
return 1 + len(page.assembled.elements)
|
560
|
+
|
561
|
+
page_element_count = [n_page_elements(i) for i in result.pages]
|
562
|
+
element_page_cutoff = list(accumulate([1] + page_element_count))
|
563
|
+
for i, page in enumerate(result.pages):
|
564
|
+
page_start = element_page_cutoff[i]
|
565
|
+
page_end = element_page_cutoff[i + 1]
|
566
|
+
md_file = output_dir / f"page_{i}.md"
|
567
|
+
# we could have just directly exported to a markdown string,
|
568
|
+
# but we need to save to a file to force generation of image-files.
|
569
|
+
result.document.save_as_markdown(
|
570
|
+
md_file,
|
571
|
+
image_mode=ImageRefMode.REFERENCED,
|
572
|
+
from_element=page_start,
|
573
|
+
to_element=page_end,
|
574
|
+
)
|
575
|
+
yield i, md_file
|
523
576
|
|
524
|
-
|
525
|
-
file_path = self.source
|
526
|
-
if file_path == "bytes":
|
527
|
-
with tempfile.NamedTemporaryFile(delete=False) as tmp:
|
528
|
-
tmp.write(self.doc_bytes.getvalue())
|
529
|
-
file_path = tmp.name
|
530
|
-
result: ConversionResult = converter.convert(file_path)
|
531
|
-
doc = result.document
|
532
|
-
n_pages = doc.num_pages() # type: ignore
|
533
|
-
for i in range(n_pages):
|
534
|
-
texts = [
|
535
|
-
item[0].text
|
536
|
-
for item in doc.iterate_items(page_no=i + 1)
|
537
|
-
if isinstance(item[0], TextItem)
|
538
|
-
]
|
539
|
-
text = "\n".join(texts)
|
540
|
-
yield i, text
|
541
|
-
|
542
|
-
def get_document_from_page(self, page: str) -> Document:
|
577
|
+
def get_document_from_page(self, md_file: str) -> Document:
|
543
578
|
"""
|
544
|
-
Get Document object from a given
|
579
|
+
Get Document object from a given 1-page markdown file,
|
580
|
+
possibly containing image refs.
|
545
581
|
|
546
582
|
Args:
|
547
|
-
|
583
|
+
md_file (str): The markdown file path for the page.
|
548
584
|
|
549
585
|
Returns:
|
550
586
|
Document: Document object, with content and possible metadata.
|
551
587
|
"""
|
588
|
+
with open(md_file, "r") as f:
|
589
|
+
text = f.read()
|
552
590
|
return Document(
|
553
|
-
content=self.fix_text(
|
591
|
+
content=self.fix_text(text),
|
554
592
|
metadata=DocMetaData(source=self.source),
|
555
593
|
)
|
556
594
|
|
@@ -51,7 +51,7 @@ class ParsingConfig(BaseSettings):
|
|
51
51
|
n_similar_docs: int = 4
|
52
52
|
n_neighbor_ids: int = 5 # window size to store around each chunk
|
53
53
|
separators: List[str] = ["\n\n", "\n", " ", ""]
|
54
|
-
token_encoding_model: str = "text-embedding-3-
|
54
|
+
token_encoding_model: str = "text-embedding-3-small"
|
55
55
|
pdf: PdfParsingConfig = PdfParsingConfig()
|
56
56
|
docx: DocxParsingConfig = DocxParsingConfig()
|
57
57
|
doc: DocParsingConfig = DocParsingConfig()
|
@@ -0,0 +1,51 @@
|
|
1
|
+
import tempfile
|
2
|
+
from io import BytesIO
|
3
|
+
from pathlib import Path
|
4
|
+
from tempfile import TemporaryDirectory
|
5
|
+
from typing import TYPE_CHECKING, Any, BinaryIO, List, Tuple, Union
|
6
|
+
|
7
|
+
try:
|
8
|
+
import fitz
|
9
|
+
except ImportError:
|
10
|
+
if not TYPE_CHECKING:
|
11
|
+
fitz = None
|
12
|
+
|
13
|
+
from langroid.exceptions import LangroidImportError
|
14
|
+
|
15
|
+
if fitz is None:
|
16
|
+
raise LangroidImportError("fitz", ["pymupdf", "all", "pdf-parsers", "doc-chat"])
|
17
|
+
|
18
|
+
|
19
|
+
def pdf_split_pages(
|
20
|
+
input_pdf: Union[BytesIO, BinaryIO],
|
21
|
+
) -> Tuple[List[Path], TemporaryDirectory[Any]]:
|
22
|
+
"""Splits a PDF into individual pages in a temporary directory.
|
23
|
+
|
24
|
+
Args:
|
25
|
+
input_pdf: Input PDF file in bytes or binary mode
|
26
|
+
max_workers: Maximum number of concurrent workers for parallel processing
|
27
|
+
|
28
|
+
Returns:
|
29
|
+
Tuple containing:
|
30
|
+
- List of paths to individual PDF pages
|
31
|
+
- Temporary directory object (caller must call cleanup())
|
32
|
+
|
33
|
+
Example:
|
34
|
+
paths, tmp_dir = split_pdf_temp("input.pdf")
|
35
|
+
# Use paths...
|
36
|
+
tmp_dir.cleanup() # Clean up temp files when done
|
37
|
+
"""
|
38
|
+
tmp_dir = tempfile.TemporaryDirectory()
|
39
|
+
doc = fitz.open(stream=input_pdf, filetype="pdf")
|
40
|
+
paths = []
|
41
|
+
|
42
|
+
for page_num in range(len(doc)):
|
43
|
+
new_doc = fitz.open()
|
44
|
+
new_doc.insert_pdf(doc, from_page=page_num, to_page=page_num)
|
45
|
+
output = Path(tmp_dir.name) / f"page_{page_num + 1}.pdf"
|
46
|
+
new_doc.save(str(output))
|
47
|
+
new_doc.close()
|
48
|
+
paths.append(output)
|
49
|
+
|
50
|
+
doc.close()
|
51
|
+
return paths, tmp_dir
|
@@ -7,6 +7,7 @@ See tests for examples: tests/main/test_string_search.py
|
|
7
7
|
"""
|
8
8
|
|
9
9
|
import difflib
|
10
|
+
import re
|
10
11
|
from typing import List, Tuple
|
11
12
|
|
12
13
|
from nltk.corpus import stopwords
|
@@ -195,8 +196,10 @@ def get_context(
|
|
195
196
|
|
196
197
|
Returns:
|
197
198
|
str: A string containing b words before, the match, and a words after
|
198
|
-
the best approximate match position of the query in the text.
|
199
|
-
|
199
|
+
the best approximate match position of the query in the text.
|
200
|
+
The text is extracted from the original `text`, preserving formatting,
|
201
|
+
whitespace, etc, so it does not disturb any downstream processing.
|
202
|
+
If no match is found, returns empty string.
|
200
203
|
int: The start position of the match in the text.
|
201
204
|
int: The end position of the match in the text.
|
202
205
|
|
@@ -204,6 +207,8 @@ def get_context(
|
|
204
207
|
>>> get_context("apple", "The quick brown fox jumps over the apple.", 3, 2)
|
205
208
|
# 'fox jumps over the apple.'
|
206
209
|
"""
|
210
|
+
|
211
|
+
# If no word limits specified, return full text
|
207
212
|
if words_after is None and words_before is None:
|
208
213
|
# return entire text since we're not asked to return a bounded context
|
209
214
|
return text, 0, 0
|
@@ -212,23 +217,35 @@ def get_context(
|
|
212
217
|
if fuzz.partial_ratio(query, text) < 40:
|
213
218
|
return "", 0, 0
|
214
219
|
|
220
|
+
# Find best matching position of query in text
|
215
221
|
sequence_matcher = difflib.SequenceMatcher(None, text, query)
|
216
222
|
match = sequence_matcher.find_longest_match(0, len(text), 0, len(query))
|
217
223
|
|
218
224
|
if match.size == 0:
|
219
225
|
return "", 0, 0
|
220
226
|
|
227
|
+
# Count words before match point
|
221
228
|
segments = text.split()
|
222
229
|
n_segs = len(segments)
|
223
|
-
|
224
230
|
start_segment_pos = len(text[: match.a].split())
|
225
231
|
|
232
|
+
# Calculate word window boundaries
|
226
233
|
words_before = words_before or n_segs
|
227
234
|
words_after = words_after or n_segs
|
228
235
|
start_pos = max(0, start_segment_pos - words_before)
|
229
236
|
end_pos = min(len(segments), start_segment_pos + words_after + len(query.split()))
|
230
237
|
|
231
|
-
|
238
|
+
# Find character positions where words start
|
239
|
+
word_positions = [m.start() for m in re.finditer(r"\S+", text)]
|
240
|
+
|
241
|
+
# Convert word positions to character positions
|
242
|
+
start_char = word_positions[start_pos] if start_pos < len(word_positions) else 0
|
243
|
+
end_char = word_positions[min(end_pos, len(word_positions) - 1)] + len(
|
244
|
+
text.split()[min(end_pos - 1, len(word_positions) - 1)]
|
245
|
+
)
|
246
|
+
|
247
|
+
# return exact substring with original formatting
|
248
|
+
return text[start_char:end_char], start_pos, end_pos
|
232
249
|
|
233
250
|
|
234
251
|
def eliminate_near_duplicates(passages: List[str], threshold: float = 0.8) -> List[str]:
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
{langroid-0.37.0 → langroid-0.37.2}/langroid/agent/special/sql/utils/description_extractors.py
RENAMED
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
{langroid-0.37.0 → langroid-0.37.2}/langroid/language_models/prompt_formatter/hf_formatter.py
RENAMED
File without changes
|
{langroid-0.37.0 → langroid-0.37.2}/langroid/language_models/prompt_formatter/llama2_formatter.py
RENAMED
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|