langroid 0.1.72__tar.gz → 0.1.76__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {langroid-0.1.72 → langroid-0.1.76}/PKG-INFO +9 -10
- {langroid-0.1.72 → langroid-0.1.76}/README.md +7 -9
- {langroid-0.1.72 → langroid-0.1.76}/langroid/agent/special/doc_chat_agent.py +6 -5
- {langroid-0.1.72 → langroid-0.1.76}/langroid/agent/special/retriever_agent.py +0 -1
- langroid-0.1.72/langroid/parsing/pdf_parser.py → langroid-0.1.76/langroid/parsing/document_parser.py +122 -25
- {langroid-0.1.72 → langroid-0.1.76}/langroid/parsing/parser.py +5 -0
- {langroid-0.1.72 → langroid-0.1.76}/langroid/parsing/repo_loader.py +6 -6
- {langroid-0.1.72 → langroid-0.1.76}/langroid/parsing/url_loader.py +6 -6
- {langroid-0.1.72 → langroid-0.1.76}/langroid/vector_store/base.py +13 -4
- {langroid-0.1.72 → langroid-0.1.76}/langroid/vector_store/chromadb.py +0 -1
- {langroid-0.1.72 → langroid-0.1.76}/langroid/vector_store/qdrantdb.py +0 -2
- {langroid-0.1.72 → langroid-0.1.76}/pyproject.toml +2 -1
- langroid-0.1.76/setup.py +104 -0
- langroid-0.1.72/setup.py +0 -103
- {langroid-0.1.72 → langroid-0.1.76}/LICENSE +0 -0
- {langroid-0.1.72 → langroid-0.1.76}/langroid/__init__.py +0 -0
- {langroid-0.1.72 → langroid-0.1.76}/langroid/agent/__init__.py +0 -0
- {langroid-0.1.72 → langroid-0.1.76}/langroid/agent/base.py +0 -0
- {langroid-0.1.72 → langroid-0.1.76}/langroid/agent/chat_agent.py +0 -0
- {langroid-0.1.72 → langroid-0.1.76}/langroid/agent/chat_document.py +0 -0
- {langroid-0.1.72 → langroid-0.1.76}/langroid/agent/helpers.py +0 -0
- {langroid-0.1.72 → langroid-0.1.76}/langroid/agent/junk +0 -0
- {langroid-0.1.72 → langroid-0.1.76}/langroid/agent/special/__init__.py +0 -0
- {langroid-0.1.72 → langroid-0.1.76}/langroid/agent/special/recipient_validator_agent.py +0 -0
- {langroid-0.1.72 → langroid-0.1.76}/langroid/agent/special/sql/__init__.py +0 -0
- {langroid-0.1.72 → langroid-0.1.76}/langroid/agent/special/sql/sql_chat_agent.py +0 -0
- {langroid-0.1.72 → langroid-0.1.76}/langroid/agent/special/sql/utils/__init__.py +0 -0
- {langroid-0.1.72 → langroid-0.1.76}/langroid/agent/special/sql/utils/description_extractors.py +0 -0
- {langroid-0.1.72 → langroid-0.1.76}/langroid/agent/special/sql/utils/populate_metadata.py +0 -0
- {langroid-0.1.72 → langroid-0.1.76}/langroid/agent/special/sql/utils/system_message.py +0 -0
- {langroid-0.1.72 → langroid-0.1.76}/langroid/agent/special/sql/utils/tools.py +0 -0
- {langroid-0.1.72 → langroid-0.1.76}/langroid/agent/special/table_chat_agent.py +0 -0
- {langroid-0.1.72 → langroid-0.1.76}/langroid/agent/task.py +0 -0
- {langroid-0.1.72 → langroid-0.1.76}/langroid/agent/tool_message.py +0 -0
- {langroid-0.1.72 → langroid-0.1.76}/langroid/agent/tools/__init__.py +0 -0
- {langroid-0.1.72 → langroid-0.1.76}/langroid/agent/tools/google_search_tool.py +0 -0
- {langroid-0.1.72 → langroid-0.1.76}/langroid/agent/tools/recipient_tool.py +0 -0
- {langroid-0.1.72 → langroid-0.1.76}/langroid/agent_config.py +0 -0
- {langroid-0.1.72 → langroid-0.1.76}/langroid/cachedb/__init__.py +0 -0
- {langroid-0.1.72 → langroid-0.1.76}/langroid/cachedb/base.py +0 -0
- {langroid-0.1.72 → langroid-0.1.76}/langroid/cachedb/momento_cachedb.py +0 -0
- {langroid-0.1.72 → langroid-0.1.76}/langroid/cachedb/redis_cachedb.py +0 -0
- {langroid-0.1.72 → langroid-0.1.76}/langroid/embedding_models/__init__.py +0 -0
- {langroid-0.1.72 → langroid-0.1.76}/langroid/embedding_models/base.py +0 -0
- {langroid-0.1.72 → langroid-0.1.76}/langroid/embedding_models/clustering.py +0 -0
- {langroid-0.1.72 → langroid-0.1.76}/langroid/embedding_models/models.py +0 -0
- {langroid-0.1.72 → langroid-0.1.76}/langroid/language_models/__init__.py +0 -0
- {langroid-0.1.72 → langroid-0.1.76}/langroid/language_models/azure_openai.py +0 -0
- {langroid-0.1.72 → langroid-0.1.76}/langroid/language_models/base.py +0 -0
- {langroid-0.1.72 → langroid-0.1.76}/langroid/language_models/config.py +0 -0
- {langroid-0.1.72 → langroid-0.1.76}/langroid/language_models/openai_gpt.py +0 -0
- {langroid-0.1.72 → langroid-0.1.76}/langroid/language_models/prompt_formatter/__init__.py +0 -0
- {langroid-0.1.72 → langroid-0.1.76}/langroid/language_models/prompt_formatter/base.py +0 -0
- {langroid-0.1.72 → langroid-0.1.76}/langroid/language_models/prompt_formatter/llama2_formatter.py +0 -0
- {langroid-0.1.72 → langroid-0.1.76}/langroid/language_models/utils.py +0 -0
- {langroid-0.1.72 → langroid-0.1.76}/langroid/mytypes.py +0 -0
- {langroid-0.1.72 → langroid-0.1.76}/langroid/parsing/__init__.py +0 -0
- {langroid-0.1.72 → langroid-0.1.76}/langroid/parsing/agent_chats.py +0 -0
- {langroid-0.1.72 → langroid-0.1.76}/langroid/parsing/code-parsing.md +0 -0
- {langroid-0.1.72 → langroid-0.1.76}/langroid/parsing/code_parser.py +0 -0
- {langroid-0.1.72 → langroid-0.1.76}/langroid/parsing/config.py +0 -0
- {langroid-0.1.72 → langroid-0.1.76}/langroid/parsing/json.py +0 -0
- {langroid-0.1.72 → langroid-0.1.76}/langroid/parsing/para_sentence_split.py +0 -0
- {langroid-0.1.72 → langroid-0.1.76}/langroid/parsing/search.py +0 -0
- {langroid-0.1.72 → langroid-0.1.76}/langroid/parsing/table_loader.py +0 -0
- {langroid-0.1.72 → langroid-0.1.76}/langroid/parsing/url_loader_cookies.py +0 -0
- {langroid-0.1.72 → langroid-0.1.76}/langroid/parsing/urls.py +0 -0
- {langroid-0.1.72 → langroid-0.1.76}/langroid/parsing/utils.py +0 -0
- {langroid-0.1.72 → langroid-0.1.76}/langroid/parsing/web_search.py +0 -0
- {langroid-0.1.72 → langroid-0.1.76}/langroid/prompts/__init__.py +0 -0
- {langroid-0.1.72 → langroid-0.1.76}/langroid/prompts/dialog.py +0 -0
- {langroid-0.1.72 → langroid-0.1.76}/langroid/prompts/prompts_config.py +0 -0
- {langroid-0.1.72 → langroid-0.1.76}/langroid/prompts/templates.py +0 -0
- {langroid-0.1.72 → langroid-0.1.76}/langroid/prompts/transforms.py +0 -0
- {langroid-0.1.72 → langroid-0.1.76}/langroid/scripts/__init__.py +0 -0
- {langroid-0.1.72 → langroid-0.1.76}/langroid/utils/__init__.py +0 -0
- {langroid-0.1.72 → langroid-0.1.76}/langroid/utils/configuration.py +0 -0
- {langroid-0.1.72 → langroid-0.1.76}/langroid/utils/constants.py +0 -0
- {langroid-0.1.72 → langroid-0.1.76}/langroid/utils/docker.py +0 -0
- {langroid-0.1.72 → langroid-0.1.76}/langroid/utils/globals.py +0 -0
- {langroid-0.1.72 → langroid-0.1.76}/langroid/utils/llms/__init__.py +0 -0
- {langroid-0.1.72 → langroid-0.1.76}/langroid/utils/llms/strings.py +0 -0
- {langroid-0.1.72 → langroid-0.1.76}/langroid/utils/logging.py +0 -0
- {langroid-0.1.72 → langroid-0.1.76}/langroid/utils/output/__init__.py +0 -0
- {langroid-0.1.72 → langroid-0.1.76}/langroid/utils/output/printing.py +0 -0
- {langroid-0.1.72 → langroid-0.1.76}/langroid/utils/pydantic_utils.py +0 -0
- {langroid-0.1.72 → langroid-0.1.76}/langroid/utils/system.py +0 -0
- {langroid-0.1.72 → langroid-0.1.76}/langroid/utils/web/__init__.py +0 -0
- {langroid-0.1.72 → langroid-0.1.76}/langroid/utils/web/login.py +0 -0
- {langroid-0.1.72 → langroid-0.1.76}/langroid/utils/web/selenium_login.py +0 -0
- {langroid-0.1.72 → langroid-0.1.76}/langroid/vector_store/__init__.py +0 -0
- {langroid-0.1.72 → langroid-0.1.76}/langroid/vector_store/qdrant_cloud.py +0 -0
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.1
|
2
2
|
Name: langroid
|
3
|
-
Version: 0.1.72
|
3
|
+
Version: 0.1.76
|
4
4
|
Summary: Harness LLMs with Multi-Agent Programming
|
5
5
|
License: MIT
|
6
6
|
Author: Prasad Chalasani
|
@@ -70,6 +70,7 @@ Requires-Dist: trafilatura (>=1.5.0,<2.0.0)
|
|
70
70
|
Requires-Dist: typer (>=0.7.0,<0.8.0)
|
71
71
|
Requires-Dist: types-redis (>=4.5.5.2,<5.0.0.0)
|
72
72
|
Requires-Dist: types-requests (>=2.31.0.1,<3.0.0.0)
|
73
|
+
Requires-Dist: unstructured[docx,pdf,pptx] (>=0.10.16,<0.11.0)
|
73
74
|
Requires-Dist: wget (>=3.2,<4.0)
|
74
75
|
Description-Content-Type: text/markdown
|
75
76
|
|
@@ -130,8 +131,8 @@ This Multi-Agent paradigm is inspired by the
|
|
130
131
|
[Actor Framework](https://en.wikipedia.org/wiki/Actor_model)
|
131
132
|
(but you do not need to know anything about this!).
|
132
133
|
|
133
|
-
Langroid is a fresh take on LLM app-development, where considerable thought has gone
|
134
|
-
into simplifying the developer experience
|
134
|
+
`Langroid` is a fresh take on LLM app-development, where considerable thought has gone
|
135
|
+
into simplifying the developer experience; it does not use `Langchain`.
|
135
136
|
|
136
137
|
We welcome contributions -- See the [contributions](./CONTRIBUTING.md) document
|
137
138
|
for ideas on what to contribute.
|
@@ -142,6 +143,8 @@ for ideas on what to contribute.
|
|
142
143
|
<summary> <b>:fire: Updates/Releases</b></summary>
|
143
144
|
|
144
145
|
- **Sep 2023:**
|
146
|
+
- **0.1.72:** Many improvements to DocChatAgent: better embedding model,
|
147
|
+
hybrid search to improve retrieval, better pdf parsing, re-ranking retrieved results with cross-encoders.
|
145
148
|
- **Use with local LLama Models:** see tutorial [here](https://langroid.github.io/langroid/blog/2023/09/14/using-langroid-with-local-llms/)
|
146
149
|
- **Langroid Blog/Newsletter Launched!**: First post is [here](https://substack.com/notes/post/p-136704592) -- Please subscribe to stay updated.
|
147
150
|
- **0.1.56:** Support Azure OpenAI.
|
@@ -167,7 +170,7 @@ See [this test](tests/main/test_recipient_tool.py) for example usage.
|
|
167
170
|
- **0.1.27**: Added [support](langroid/cachedb/momento_cachedb.py)
|
168
171
|
for [Momento Serverless Cache](https://www.gomomento.com/) as an alternative to Redis.
|
169
172
|
- **0.1.24**: [`DocChatAgent`](langroid/agent/special/doc_chat_agent.py)
|
170
|
-
now [accepts](langroid/parsing/pdf_parser.py) PDF files or URLs.
|
173
|
+
now [accepts](langroid/parsing/document_parser.py) PDF files or URLs.
|
171
174
|
|
172
175
|
</details>
|
173
176
|
|
@@ -231,9 +234,6 @@ Here is what it looks like in action:
|
|
231
234
|
|
232
235
|
# :gear: Installation and Setup
|
233
236
|
|
234
|
-
:whale: For a simpler setup, see the Docker section below, which lets you get started just
|
235
|
-
by setting up environment variables in a `.env` file.
|
236
|
-
|
237
237
|
### Install `langroid`
|
238
238
|
Langroid requires Python 3.11+. We recommend using a virtual environment.
|
239
239
|
Use `pip` to install `langroid` (from PyPi) to your virtual environment:
|
@@ -663,6 +663,7 @@ First create a `DocChatAgentConfig` instance, with a
|
|
663
663
|
|
664
664
|
```python
|
665
665
|
from langroid.agent.doc_chat_agent import DocChatAgentConfig
|
666
|
+
from langroid.vector_store.qdrantdb import QdrantDBConfig
|
666
667
|
config = DocChatAgentConfig(
|
667
668
|
doc_paths = [
|
668
669
|
"https://en.wikipedia.org/wiki/Language_model",
|
@@ -672,9 +673,7 @@ config = DocChatAgentConfig(
|
|
672
673
|
llm = OpenAIGPTConfig(
|
673
674
|
chat_model=OpenAIChatModel.GPT4,
|
674
675
|
),
|
675
|
-
vecdb=
|
676
|
-
type="qdrant",
|
677
|
-
),
|
676
|
+
vecdb=QdrantDBConfig()
|
678
677
|
)
|
679
678
|
```
|
680
679
|
|
@@ -55,8 +55,8 @@ This Multi-Agent paradigm is inspired by the
|
|
55
55
|
[Actor Framework](https://en.wikipedia.org/wiki/Actor_model)
|
56
56
|
(but you do not need to know anything about this!).
|
57
57
|
|
58
|
-
Langroid is a fresh take on LLM app-development, where considerable thought has gone
|
59
|
-
into simplifying the developer experience
|
58
|
+
`Langroid` is a fresh take on LLM app-development, where considerable thought has gone
|
59
|
+
into simplifying the developer experience; it does not use `Langchain`.
|
60
60
|
|
61
61
|
We welcome contributions -- See the [contributions](./CONTRIBUTING.md) document
|
62
62
|
for ideas on what to contribute.
|
@@ -67,6 +67,8 @@ for ideas on what to contribute.
|
|
67
67
|
<summary> <b>:fire: Updates/Releases</b></summary>
|
68
68
|
|
69
69
|
- **Sep 2023:**
|
70
|
+
- **0.1.72:** Many improvements to DocChatAgent: better embedding model,
|
71
|
+
hybrid search to improve retrieval, better pdf parsing, re-ranking retrieved results with cross-encoders.
|
70
72
|
- **Use with local LLama Models:** see tutorial [here](https://langroid.github.io/langroid/blog/2023/09/14/using-langroid-with-local-llms/)
|
71
73
|
- **Langroid Blog/Newsletter Launched!**: First post is [here](https://substack.com/notes/post/p-136704592) -- Please subscribe to stay updated.
|
72
74
|
- **0.1.56:** Support Azure OpenAI.
|
@@ -92,7 +94,7 @@ See [this test](tests/main/test_recipient_tool.py) for example usage.
|
|
92
94
|
- **0.1.27**: Added [support](langroid/cachedb/momento_cachedb.py)
|
93
95
|
for [Momento Serverless Cache](https://www.gomomento.com/) as an alternative to Redis.
|
94
96
|
- **0.1.24**: [`DocChatAgent`](langroid/agent/special/doc_chat_agent.py)
|
95
|
-
now [accepts](langroid/parsing/pdf_parser.py) PDF files or URLs.
|
97
|
+
now [accepts](langroid/parsing/document_parser.py) PDF files or URLs.
|
96
98
|
|
97
99
|
</details>
|
98
100
|
|
@@ -156,9 +158,6 @@ Here is what it looks like in action:
|
|
156
158
|
|
157
159
|
# :gear: Installation and Setup
|
158
160
|
|
159
|
-
:whale: For a simpler setup, see the Docker section below, which lets you get started just
|
160
|
-
by setting up environment variables in a `.env` file.
|
161
|
-
|
162
161
|
### Install `langroid`
|
163
162
|
Langroid requires Python 3.11+. We recommend using a virtual environment.
|
164
163
|
Use `pip` to install `langroid` (from PyPi) to your virtual environment:
|
@@ -588,6 +587,7 @@ First create a `DocChatAgentConfig` instance, with a
|
|
588
587
|
|
589
588
|
```python
|
590
589
|
from langroid.agent.doc_chat_agent import DocChatAgentConfig
|
590
|
+
from langroid.vector_store.qdrantdb import QdrantDBConfig
|
591
591
|
config = DocChatAgentConfig(
|
592
592
|
doc_paths = [
|
593
593
|
"https://en.wikipedia.org/wiki/Language_model",
|
@@ -597,9 +597,7 @@ config = DocChatAgentConfig(
|
|
597
597
|
llm = OpenAIGPTConfig(
|
598
598
|
chat_model=OpenAIChatModel.GPT4,
|
599
599
|
),
|
600
|
-
vecdb=
|
601
|
-
type="qdrant",
|
602
|
-
),
|
600
|
+
vecdb=QdrantDBConfig()
|
603
601
|
)
|
604
602
|
```
|
605
603
|
|
@@ -81,6 +81,7 @@ class DocChatAgentConfig(ChatAgentConfig):
|
|
81
81
|
# and use the embed(A) to find similar chunks in vecdb.
|
82
82
|
# Referred to as HyDE in the paper:
|
83
83
|
# https://arxiv.org/pdf/2212.10496.pdf
|
84
|
+
# It is False by default; its benefit depends on the context.
|
84
85
|
hypothetical_answer: bool = False
|
85
86
|
n_query_rephrases: int = 0
|
86
87
|
use_fuzzy_match: bool = True
|
@@ -125,7 +126,6 @@ class DocChatAgentConfig(ChatAgentConfig):
|
|
125
126
|
)
|
126
127
|
|
127
128
|
vecdb: VectorStoreConfig = QdrantDBConfig(
|
128
|
-
type="qdrant",
|
129
129
|
collection_name=None,
|
130
130
|
storage_path=".qdrant/data/",
|
131
131
|
embedding=hf_embed_config,
|
@@ -392,13 +392,14 @@ class DocChatAgent(ChatAgent):
|
|
392
392
|
if self.config.hypothetical_answer:
|
393
393
|
with console.status("[cyan]LLM generating hypothetical answer..."):
|
394
394
|
with StreamingIfAllowed(self.llm, False):
|
395
|
+
# TODO: provide an easy way to
|
396
|
+
# Adjust this prompt depending on context.
|
395
397
|
answer = self.llm_response_forget(
|
396
398
|
f"""
|
397
|
-
Give
|
399
|
+
Give an ideal answer to the following query,
|
398
400
|
in up to 3 sentences. Do not explain yourself,
|
399
401
|
and do not apologize, just show
|
400
|
-
a possible answer
|
401
|
-
even if you do not have any information.
|
402
|
+
a good possible answer, even if you do not have any information.
|
402
403
|
Preface your answer with "HYPOTHETICAL ANSWER: "
|
403
404
|
|
404
405
|
QUERY: {query}
|
@@ -505,7 +506,7 @@ class DocChatAgent(ChatAgent):
|
|
505
506
|
|
506
507
|
with console.status("[cyan]LLM Extracting verbatim passages..."):
|
507
508
|
with StreamingIfAllowed(self.llm, False):
|
508
|
-
# these are async calls, one per passage
|
509
|
+
# these are async calls, one per passage; turn off streaming
|
509
510
|
extracts = self.llm.get_verbatim_extracts(query, passages)
|
510
511
|
extracts = [e for e in extracts if e.content != NO_ANSWER]
|
511
512
|
|
@@ -41,7 +41,6 @@ class RetrieverAgentConfig(DocChatAgentConfig):
|
|
41
41
|
stream: bool = True # allow streaming where needed
|
42
42
|
max_tokens: int = 10000
|
43
43
|
vecdb: VectorStoreConfig = QdrantDBConfig(
|
44
|
-
type="qdrant",
|
45
44
|
collection_name=None,
|
46
45
|
storage_path=".qdrant/data/",
|
47
46
|
embedding=OpenAIEmbeddingsConfig(
|
langroid-0.1.72/langroid/parsing/pdf_parser.py → langroid-0.1.76/langroid/parsing/document_parser.py
RENAMED
@@ -1,5 +1,6 @@
|
|
1
1
|
import re
|
2
2
|
from abc import abstractmethod
|
3
|
+
from enum import Enum
|
3
4
|
from io import BytesIO
|
4
5
|
from typing import Any, Generator, List, Tuple
|
5
6
|
|
@@ -12,35 +13,56 @@ from langroid.mytypes import DocMetaData, Document
|
|
12
13
|
from langroid.parsing.parser import Parser, ParsingConfig
|
13
14
|
|
14
15
|
|
15
|
-
class
|
16
|
+
class DocumentType(str, Enum):
|
17
|
+
PDF = "pdf"
|
18
|
+
DOCX = "docx"
|
19
|
+
|
20
|
+
|
21
|
+
class DocumentParser(Parser):
|
16
22
|
"""
|
17
|
-
Abstract base class for extracting text from
|
23
|
+
Abstract base class for extracting text from special types of docs
|
24
|
+
such as PDFs or Docx.
|
18
25
|
|
19
26
|
Attributes:
|
20
|
-
source (str): The
|
21
|
-
|
27
|
+
source (str): The source, either a URL or a file path.
|
28
|
+
doc_bytes (BytesIO): BytesIO object containing the doc data.
|
22
29
|
"""
|
23
30
|
|
24
31
|
@classmethod
|
25
|
-
def create(cls, source: str, config: ParsingConfig) -> "
|
32
|
+
def create(cls, source: str, config: ParsingConfig) -> "DocumentParser":
|
26
33
|
"""
|
27
|
-
Create a
|
34
|
+
Create a DocumentParser instance based on source type
|
35
|
+
and config.<source_type>.library specified.
|
28
36
|
|
29
37
|
Args:
|
30
38
|
source (str): The source of the document, either a URL or a file path.
|
31
39
|
config (ParsingConfig): The parser configuration.
|
32
40
|
|
33
41
|
Returns:
|
34
|
-
|
35
|
-
"""
|
36
|
-
if
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
+
DocumentParser: An instance of a DocumentParser subclass.
|
43
|
+
"""
|
44
|
+
if DocumentParser._document_type(source) == DocumentType.PDF:
|
45
|
+
if config.pdf.library == "fitz":
|
46
|
+
return FitzPDFParser(source, config)
|
47
|
+
elif config.pdf.library == "pypdf":
|
48
|
+
return PyPDFParser(source, config)
|
49
|
+
elif config.pdf.library == "pdfplumber":
|
50
|
+
return PDFPlumberParser(source, config)
|
51
|
+
elif config.pdf.library == "unstructured":
|
52
|
+
return UnstructuredPDFParser(source, config)
|
53
|
+
else:
|
54
|
+
raise ValueError(
|
55
|
+
f"Unsupported PDF library specified: {config.pdf.library}"
|
56
|
+
)
|
57
|
+
elif DocumentParser._document_type(source) == DocumentType.DOCX:
|
58
|
+
if config.docx.library == "unstructured":
|
59
|
+
return UnstructuredDocxParser(source, config)
|
60
|
+
else:
|
61
|
+
raise ValueError(
|
62
|
+
f"Unsupported DOCX library specified: {config.docx.library}"
|
63
|
+
)
|
42
64
|
else:
|
43
|
-
raise ValueError(f"Unsupported
|
65
|
+
raise ValueError(f"Unsupported document type: {source}")
|
44
66
|
|
45
67
|
def __init__(self, source: str, config: ParsingConfig):
|
46
68
|
"""
|
@@ -52,14 +74,32 @@ class PdfParser(Parser):
|
|
52
74
|
super().__init__(config)
|
53
75
|
self.source = source
|
54
76
|
self.config = config
|
55
|
-
self.
|
77
|
+
self.doc_bytes = self._load_doc_as_bytesio()
|
78
|
+
|
79
|
+
@staticmethod
|
80
|
+
def _document_type(source: str) -> DocumentType:
|
81
|
+
"""
|
82
|
+
Determine the type of document based on the source.
|
83
|
+
|
84
|
+
Args:
|
85
|
+
source (str): The source of the document, either a URL or a file path.
|
86
|
+
|
87
|
+
Returns:
|
88
|
+
DocumentType: The document type.
|
89
|
+
"""
|
90
|
+
if source.lower().endswith(".pdf"):
|
91
|
+
return DocumentType.PDF
|
92
|
+
elif source.lower().endswith(".docx"):
|
93
|
+
return DocumentType.DOCX
|
94
|
+
else:
|
95
|
+
raise ValueError(f"Unsupported document type: {source}")
|
56
96
|
|
57
|
-
def
|
97
|
+
def _load_doc_as_bytesio(self) -> BytesIO:
|
58
98
|
"""
|
59
|
-
Load the
|
99
|
+
Load the docs into a BytesIO object.
|
60
100
|
|
61
101
|
Returns:
|
62
|
-
BytesIO: A BytesIO object containing the
|
102
|
+
BytesIO: A BytesIO object containing the doc data.
|
63
103
|
"""
|
64
104
|
if self.source.startswith(("http://", "https://")):
|
65
105
|
response = requests.get(self.source)
|
@@ -159,7 +199,7 @@ class PdfParser(Parser):
|
|
159
199
|
return docs
|
160
200
|
|
161
201
|
|
162
|
-
class
|
202
|
+
class FitzPDFParser(DocumentParser):
|
163
203
|
"""
|
164
204
|
Parser for processing PDFs using the `fitz` library.
|
165
205
|
"""
|
@@ -171,7 +211,7 @@ class FitzPdfParser(PdfParser):
|
|
171
211
|
Returns:
|
172
212
|
Generator[fitz.Page]: Generator yielding each page.
|
173
213
|
"""
|
174
|
-
doc = fitz.open(stream=self.
|
214
|
+
doc = fitz.open(stream=self.doc_bytes, filetype="pdf")
|
175
215
|
for i, page in enumerate(doc):
|
176
216
|
yield i, page
|
177
217
|
doc.close()
|
@@ -189,7 +229,7 @@ class FitzPdfParser(PdfParser):
|
|
189
229
|
return self.fix_text(page.get_text())
|
190
230
|
|
191
231
|
|
192
|
-
class
|
232
|
+
class PyPDFParser(DocumentParser):
|
193
233
|
"""
|
194
234
|
Parser for processing PDFs using the `pypdf` library.
|
195
235
|
"""
|
@@ -201,7 +241,7 @@ class PyPdfParser(PdfParser):
|
|
201
241
|
Returns:
|
202
242
|
Generator[pypdf.pdf.PageObject]: Generator yielding each page.
|
203
243
|
"""
|
204
|
-
reader = pypdf.PdfReader(self.
|
244
|
+
reader = pypdf.PdfReader(self.doc_bytes)
|
205
245
|
for i, page in enumerate(reader.pages):
|
206
246
|
yield i, page
|
207
247
|
|
@@ -218,7 +258,7 @@ class PyPdfParser(PdfParser):
|
|
218
258
|
return self.fix_text(page.extract_text())
|
219
259
|
|
220
260
|
|
221
|
-
class
|
261
|
+
class PDFPlumberParser(DocumentParser):
|
222
262
|
"""
|
223
263
|
Parser for processing PDFs using the `pdfplumber` library.
|
224
264
|
"""
|
@@ -232,7 +272,7 @@ class PdfPlumberParser(PdfParser):
|
|
232
272
|
Returns:
|
233
273
|
Generator[pdfplumber.Page]: Generator yielding each page.
|
234
274
|
"""
|
235
|
-
with pdfplumber.open(self.
|
275
|
+
with pdfplumber.open(self.doc_bytes) as pdf:
|
236
276
|
for i, page in enumerate(pdf.pages):
|
237
277
|
yield i, page
|
238
278
|
|
@@ -247,3 +287,60 @@ class PdfPlumberParser(PdfParser):
|
|
247
287
|
str: Extracted text from the page.
|
248
288
|
"""
|
249
289
|
return self.fix_text(page.extract_text())
|
290
|
+
|
291
|
+
|
292
|
+
class UnstructuredPDFParser(DocumentParser):
|
293
|
+
"""
|
294
|
+
Parser for processing PDF files using the `unstructured` library.
|
295
|
+
"""
|
296
|
+
|
297
|
+
def iterate_pages(self) -> Generator[Tuple[int, Any], None, None]: # type: ignore
|
298
|
+
from unstructured.partition.pdf import partition_pdf
|
299
|
+
|
300
|
+
elements = partition_pdf(file=self.doc_bytes, include_page_breaks=True)
|
301
|
+
for i, el in enumerate(elements):
|
302
|
+
yield i, el
|
303
|
+
|
304
|
+
def extract_text_from_page(self, page: Any) -> str:
|
305
|
+
"""
|
306
|
+
Extract text from a given `unstructured` element.
|
307
|
+
|
308
|
+
Args:
|
309
|
+
page (unstructured element): The `unstructured` element object.
|
310
|
+
|
311
|
+
Returns:
|
312
|
+
str: Extracted text from the element.
|
313
|
+
"""
|
314
|
+
return self.fix_text(str(page))
|
315
|
+
|
316
|
+
|
317
|
+
class UnstructuredDocxParser(DocumentParser):
|
318
|
+
"""
|
319
|
+
Parser for processing DOCX files using the `unstructured` library.
|
320
|
+
"""
|
321
|
+
|
322
|
+
def iterate_pages(self) -> Generator[Tuple[int, Any], None, None]: # type: ignore
|
323
|
+
from unstructured.partition.docx import partition_docx
|
324
|
+
|
325
|
+
elements = partition_docx(file=self.doc_bytes)
|
326
|
+
for i, el in enumerate(elements):
|
327
|
+
yield i, el
|
328
|
+
|
329
|
+
def extract_text_from_page(self, page: Any) -> str:
|
330
|
+
"""
|
331
|
+
Extract text from a given `unstructured` element.
|
332
|
+
|
333
|
+
Note:
|
334
|
+
The concept of "pages" doesn't actually exist in the .docx file format in
|
335
|
+
the same way it does in formats like .pdf. A .docx file is made up of a
|
336
|
+
series of elements like paragraphs and tables, but the division into
|
337
|
+
pages is done dynamically based on the rendering settings (like the page
|
338
|
+
size, margin size, font size, etc.).
|
339
|
+
|
340
|
+
Args:
|
341
|
+
page (unstructured element): The `unstructured` element object.
|
342
|
+
|
343
|
+
Returns:
|
344
|
+
str: Extracted text from the element.
|
345
|
+
"""
|
346
|
+
return self.fix_text(str(page))
|
@@ -23,6 +23,10 @@ class PdfParsingConfig(BaseSettings):
|
|
23
23
|
library: str = "pdfplumber"
|
24
24
|
|
25
25
|
|
26
|
+
class DocxParsingConfig(BaseSettings):
|
27
|
+
library: str = "unstructured"
|
28
|
+
|
29
|
+
|
26
30
|
class ParsingConfig(BaseSettings):
|
27
31
|
splitter: str = Splitter.TOKENS
|
28
32
|
chunk_size: int = 200 # aim for this many tokens per chunk
|
@@ -35,6 +39,7 @@ class ParsingConfig(BaseSettings):
|
|
35
39
|
separators: List[str] = ["\n\n", "\n", " ", ""]
|
36
40
|
token_encoding_model: str = "text-embedding-ada-002"
|
37
41
|
pdf: PdfParsingConfig = PdfParsingConfig()
|
42
|
+
docx: DocxParsingConfig = DocxParsingConfig()
|
38
43
|
|
39
44
|
|
40
45
|
class Parser:
|
@@ -18,8 +18,8 @@ from github.Repository import Repository
|
|
18
18
|
from pydantic import BaseSettings
|
19
19
|
|
20
20
|
from langroid.mytypes import DocMetaData, Document
|
21
|
-
from langroid.parsing.
|
22
|
-
from langroid.parsing.
|
21
|
+
from langroid.parsing.document_parser import DocumentParser
|
22
|
+
from langroid.parsing.parser import Parser, ParsingConfig
|
23
23
|
|
24
24
|
logger = logging.getLogger(__name__)
|
25
25
|
|
@@ -440,7 +440,7 @@ class RepoLoader:
|
|
440
440
|
@staticmethod
|
441
441
|
def get_documents(
|
442
442
|
path: str,
|
443
|
-
parser: Parser,
|
443
|
+
parser: Parser = Parser(ParsingConfig()),
|
444
444
|
file_types: Optional[List[str]] = None,
|
445
445
|
exclude_dirs: Optional[List[str]] = None,
|
446
446
|
depth: int = -1,
|
@@ -493,12 +493,12 @@ class RepoLoader:
|
|
493
493
|
|
494
494
|
for file_path in file_paths:
|
495
495
|
_, file_extension = os.path.splitext(file_path)
|
496
|
-
if file_extension.lower()
|
497
|
-
|
496
|
+
if file_extension.lower() in [".pdf", ".docx"]:
|
497
|
+
doc_parser = DocumentParser.create(
|
498
498
|
file_path,
|
499
499
|
parser.config,
|
500
500
|
)
|
501
|
-
docs.extend(
|
501
|
+
docs.extend(doc_parser.get_doc_chunks())
|
502
502
|
else:
|
503
503
|
with open(file_path, "r") as f:
|
504
504
|
if lines is not None:
|
@@ -9,8 +9,8 @@ from trafilatura.downloads import (
|
|
9
9
|
)
|
10
10
|
|
11
11
|
from langroid.mytypes import DocMetaData, Document
|
12
|
-
from langroid.parsing.
|
13
|
-
from langroid.parsing.
|
12
|
+
from langroid.parsing.document_parser import DocumentParser
|
13
|
+
from langroid.parsing.parser import Parser, ParsingConfig
|
14
14
|
|
15
15
|
logging.getLogger("trafilatura").setLevel(logging.ERROR)
|
16
16
|
|
@@ -27,7 +27,7 @@ class URLLoader:
|
|
27
27
|
the "accept" button on the cookie dialog.
|
28
28
|
"""
|
29
29
|
|
30
|
-
def __init__(self, urls: List[str], parser: Parser):
|
30
|
+
def __init__(self, urls: List[str], parser: Parser = Parser(ParsingConfig())):
|
31
31
|
self.urls = urls
|
32
32
|
self.parser = parser
|
33
33
|
|
@@ -44,12 +44,12 @@ class URLLoader:
|
|
44
44
|
sleep_time=5,
|
45
45
|
)
|
46
46
|
for url, result in buffered_downloads(buffer, threads):
|
47
|
-
if url.lower().endswith(".pdf"):
|
48
|
-
|
47
|
+
if url.lower().endswith(".pdf") or url.lower().endswith(".docx"):
|
48
|
+
doc_parser = DocumentParser.create(
|
49
49
|
url,
|
50
50
|
self.parser.config,
|
51
51
|
)
|
52
|
-
docs.extend(
|
52
|
+
docs.extend(doc_parser.get_doc_chunks())
|
53
53
|
else:
|
54
54
|
text = trafilatura.extract(
|
55
55
|
result,
|
@@ -14,6 +14,7 @@ logger = logging.getLogger(__name__)
|
|
14
14
|
|
15
15
|
|
16
16
|
class VectorStoreConfig(BaseSettings):
|
17
|
+
type: str = "qdrant" # deprecated, keeping it for backward compatibility
|
17
18
|
collection_name: str | None = None
|
18
19
|
replace_collection: bool = False # replace collection if it already exists
|
19
20
|
storage_path: str = ".qdrant/data"
|
@@ -23,7 +24,6 @@ class VectorStoreConfig(BaseSettings):
|
|
23
24
|
model_type="openai",
|
24
25
|
)
|
25
26
|
timeout: int = 60
|
26
|
-
type: str = "qdrant"
|
27
27
|
host: str = "127.0.0.1"
|
28
28
|
port: int = 6333
|
29
29
|
# compose_file: str = "langroid/vector_store/docker-compose-qdrant.yml"
|
@@ -38,7 +38,7 @@ class VectorStore(ABC):
|
|
38
38
|
self.config = config
|
39
39
|
|
40
40
|
@staticmethod
|
41
|
-
def create(config: VectorStoreConfig) -> "VectorStore":
|
41
|
+
def create(config: VectorStoreConfig) -> Optional["VectorStore"]:
|
42
42
|
from langroid.vector_store.chromadb import ChromaDB, ChromaDBConfig
|
43
43
|
from langroid.vector_store.qdrantdb import QdrantDB, QdrantDBConfig
|
44
44
|
|
@@ -47,7 +47,16 @@ class VectorStore(ABC):
|
|
47
47
|
elif isinstance(config, ChromaDBConfig):
|
48
48
|
return ChromaDB(config)
|
49
49
|
else:
|
50
|
-
|
50
|
+
logger.warning(
|
51
|
+
f"""
|
52
|
+
Unknown vector store config: {config.__repr_name__()},
|
53
|
+
so skipping vector store creation!
|
54
|
+
If you intended to use a vector-store, please set a specific
|
55
|
+
vector-store in your script, typically in the `vecdb` field of a
|
56
|
+
`ChatAgentConfig`, otherwise set it to None.
|
57
|
+
"""
|
58
|
+
)
|
59
|
+
return None
|
51
60
|
|
52
61
|
@abstractmethod
|
53
62
|
def clear_empty_collections(self) -> int:
|
@@ -123,4 +132,4 @@ class VectorStore(ABC):
|
|
123
132
|
def show_if_debug(self, doc_score_pairs: List[Tuple[Document, float]]) -> None:
|
124
133
|
if settings.debug:
|
125
134
|
for i, (d, s) in enumerate(doc_score_pairs):
|
126
|
-
print_long_text("red", "italic red", f"
|
135
|
+
print_long_text("red", "italic red", f"\nMATCH-{i}\n", d.content)
|
@@ -28,9 +28,7 @@ logger = logging.getLogger(__name__)
|
|
28
28
|
|
29
29
|
|
30
30
|
class QdrantDBConfig(VectorStoreConfig):
|
31
|
-
type: str = "qdrant"
|
32
31
|
cloud: bool = True
|
33
|
-
|
34
32
|
collection_name: str | None = None
|
35
33
|
storage_path: str = ".qdrant/data"
|
36
34
|
embedding: EmbeddingModelsConfig = OpenAIEmbeddingsConfig()
|
@@ -1,6 +1,6 @@
|
|
1
1
|
[tool.poetry]
|
2
2
|
name = "langroid"
|
3
|
-
version = "0.1.72"
|
3
|
+
version = "0.1.76"
|
4
4
|
description = "Harness LLMs with Multi-Agent Programming"
|
5
5
|
authors = ["Prasad Chalasani <pchalasani@gmail.com>"]
|
6
6
|
readme = "README.md"
|
@@ -54,6 +54,7 @@ sqlalchemy = "^2.0.19"
|
|
54
54
|
prettytable = "^3.8.0"
|
55
55
|
google-api-python-client = "^2.95.0"
|
56
56
|
lxml = "^4.9.3"
|
57
|
+
unstructured = {extras = ["docx", "pptx", "pdf"], version = "^0.10.16"}
|
57
58
|
|
58
59
|
# optional; see extras section below
|
59
60
|
sentence-transformers = {version="2.2.2", optional=true}
|