langroid 0.1.217__tar.gz → 0.1.219__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {langroid-0.1.217 → langroid-0.1.219}/PKG-INFO +3 -2
- {langroid-0.1.217 → langroid-0.1.219}/langroid/agent/special/doc_chat_agent.py +54 -25
- {langroid-0.1.217 → langroid-0.1.219}/langroid/parsing/document_parser.py +145 -22
- {langroid-0.1.217 → langroid-0.1.219}/langroid/parsing/parse_json.py +18 -24
- {langroid-0.1.217 → langroid-0.1.219}/langroid/parsing/repo_loader.py +69 -49
- {langroid-0.1.217 → langroid-0.1.219}/langroid/parsing/urls.py +18 -9
- {langroid-0.1.217 → langroid-0.1.219}/langroid/parsing/utils.py +27 -9
- {langroid-0.1.217 → langroid-0.1.219}/langroid/utils/system.py +1 -1
- {langroid-0.1.217 → langroid-0.1.219}/pyproject.toml +3 -2
- {langroid-0.1.217 → langroid-0.1.219}/LICENSE +0 -0
- {langroid-0.1.217 → langroid-0.1.219}/README.md +0 -0
- {langroid-0.1.217 → langroid-0.1.219}/langroid/__init__.py +0 -0
- {langroid-0.1.217 → langroid-0.1.219}/langroid/agent/__init__.py +0 -0
- {langroid-0.1.217 → langroid-0.1.219}/langroid/agent/base.py +0 -0
- {langroid-0.1.217 → langroid-0.1.219}/langroid/agent/batch.py +0 -0
- {langroid-0.1.217 → langroid-0.1.219}/langroid/agent/callbacks/__init__.py +0 -0
- {langroid-0.1.217 → langroid-0.1.219}/langroid/agent/callbacks/chainlit.py +0 -0
- {langroid-0.1.217 → langroid-0.1.219}/langroid/agent/chat_agent.py +0 -0
- {langroid-0.1.217 → langroid-0.1.219}/langroid/agent/chat_document.py +0 -0
- {langroid-0.1.217 → langroid-0.1.219}/langroid/agent/helpers.py +0 -0
- {langroid-0.1.217 → langroid-0.1.219}/langroid/agent/junk +0 -0
- {langroid-0.1.217 → langroid-0.1.219}/langroid/agent/openai_assistant.py +0 -0
- {langroid-0.1.217 → langroid-0.1.219}/langroid/agent/special/__init__.py +0 -0
- {langroid-0.1.217 → langroid-0.1.219}/langroid/agent/special/lance_doc_chat_agent.py +0 -0
- {langroid-0.1.217 → langroid-0.1.219}/langroid/agent/special/lance_rag/__init__.py +0 -0
- {langroid-0.1.217 → langroid-0.1.219}/langroid/agent/special/lance_rag/critic_agent.py +0 -0
- {langroid-0.1.217 → langroid-0.1.219}/langroid/agent/special/lance_rag/lance_rag_task.py +0 -0
- {langroid-0.1.217 → langroid-0.1.219}/langroid/agent/special/lance_rag/query_planner_agent.py +0 -0
- {langroid-0.1.217 → langroid-0.1.219}/langroid/agent/special/lance_tools.py +0 -0
- {langroid-0.1.217 → langroid-0.1.219}/langroid/agent/special/neo4j/__init__.py +0 -0
- {langroid-0.1.217 → langroid-0.1.219}/langroid/agent/special/neo4j/csv_kg_chat.py +0 -0
- {langroid-0.1.217 → langroid-0.1.219}/langroid/agent/special/neo4j/neo4j_chat_agent.py +0 -0
- {langroid-0.1.217 → langroid-0.1.219}/langroid/agent/special/neo4j/utils/__init__.py +0 -0
- {langroid-0.1.217 → langroid-0.1.219}/langroid/agent/special/neo4j/utils/system_message.py +0 -0
- {langroid-0.1.217 → langroid-0.1.219}/langroid/agent/special/relevance_extractor_agent.py +0 -0
- {langroid-0.1.217 → langroid-0.1.219}/langroid/agent/special/retriever_agent.py +0 -0
- {langroid-0.1.217 → langroid-0.1.219}/langroid/agent/special/sql/__init__.py +0 -0
- {langroid-0.1.217 → langroid-0.1.219}/langroid/agent/special/sql/sql_chat_agent.py +0 -0
- {langroid-0.1.217 → langroid-0.1.219}/langroid/agent/special/sql/utils/__init__.py +0 -0
- {langroid-0.1.217 → langroid-0.1.219}/langroid/agent/special/sql/utils/description_extractors.py +0 -0
- {langroid-0.1.217 → langroid-0.1.219}/langroid/agent/special/sql/utils/populate_metadata.py +0 -0
- {langroid-0.1.217 → langroid-0.1.219}/langroid/agent/special/sql/utils/system_message.py +0 -0
- {langroid-0.1.217 → langroid-0.1.219}/langroid/agent/special/sql/utils/tools.py +0 -0
- {langroid-0.1.217 → langroid-0.1.219}/langroid/agent/special/table_chat_agent.py +0 -0
- {langroid-0.1.217 → langroid-0.1.219}/langroid/agent/task.py +0 -0
- {langroid-0.1.217 → langroid-0.1.219}/langroid/agent/tool_message.py +0 -0
- {langroid-0.1.217 → langroid-0.1.219}/langroid/agent/tools/__init__.py +0 -0
- {langroid-0.1.217 → langroid-0.1.219}/langroid/agent/tools/duckduckgo_search_tool.py +0 -0
- {langroid-0.1.217 → langroid-0.1.219}/langroid/agent/tools/extract_tool.py +0 -0
- {langroid-0.1.217 → langroid-0.1.219}/langroid/agent/tools/generator_tool.py +0 -0
- {langroid-0.1.217 → langroid-0.1.219}/langroid/agent/tools/google_search_tool.py +0 -0
- {langroid-0.1.217 → langroid-0.1.219}/langroid/agent/tools/metaphor_search_tool.py +0 -0
- {langroid-0.1.217 → langroid-0.1.219}/langroid/agent/tools/recipient_tool.py +0 -0
- {langroid-0.1.217 → langroid-0.1.219}/langroid/agent/tools/run_python_code.py +0 -0
- {langroid-0.1.217 → langroid-0.1.219}/langroid/agent/tools/sciphi_search_rag_tool.py +0 -0
- {langroid-0.1.217 → langroid-0.1.219}/langroid/agent/tools/segment_extract_tool.py +0 -0
- {langroid-0.1.217 → langroid-0.1.219}/langroid/agent_config.py +0 -0
- {langroid-0.1.217 → langroid-0.1.219}/langroid/cachedb/__init__.py +0 -0
- {langroid-0.1.217 → langroid-0.1.219}/langroid/cachedb/base.py +0 -0
- {langroid-0.1.217 → langroid-0.1.219}/langroid/cachedb/momento_cachedb.py +0 -0
- {langroid-0.1.217 → langroid-0.1.219}/langroid/cachedb/redis_cachedb.py +0 -0
- {langroid-0.1.217 → langroid-0.1.219}/langroid/embedding_models/__init__.py +0 -0
- {langroid-0.1.217 → langroid-0.1.219}/langroid/embedding_models/base.py +0 -0
- {langroid-0.1.217 → langroid-0.1.219}/langroid/embedding_models/clustering.py +0 -0
- {langroid-0.1.217 → langroid-0.1.219}/langroid/embedding_models/models.py +0 -0
- {langroid-0.1.217 → langroid-0.1.219}/langroid/embedding_models/protoc/embeddings.proto +0 -0
- {langroid-0.1.217 → langroid-0.1.219}/langroid/embedding_models/protoc/embeddings_pb2.py +0 -0
- {langroid-0.1.217 → langroid-0.1.219}/langroid/embedding_models/protoc/embeddings_pb2.pyi +0 -0
- {langroid-0.1.217 → langroid-0.1.219}/langroid/embedding_models/protoc/embeddings_pb2_grpc.py +0 -0
- {langroid-0.1.217 → langroid-0.1.219}/langroid/embedding_models/remote_embeds.py +0 -0
- {langroid-0.1.217 → langroid-0.1.219}/langroid/language_models/__init__.py +0 -0
- {langroid-0.1.217 → langroid-0.1.219}/langroid/language_models/azure_openai.py +0 -0
- {langroid-0.1.217 → langroid-0.1.219}/langroid/language_models/base.py +0 -0
- {langroid-0.1.217 → langroid-0.1.219}/langroid/language_models/config.py +0 -0
- {langroid-0.1.217 → langroid-0.1.219}/langroid/language_models/openai_assistants.py +0 -0
- {langroid-0.1.217 → langroid-0.1.219}/langroid/language_models/openai_gpt.py +0 -0
- {langroid-0.1.217 → langroid-0.1.219}/langroid/language_models/prompt_formatter/__init__.py +0 -0
- {langroid-0.1.217 → langroid-0.1.219}/langroid/language_models/prompt_formatter/base.py +0 -0
- {langroid-0.1.217 → langroid-0.1.219}/langroid/language_models/prompt_formatter/hf_formatter.py +0 -0
- {langroid-0.1.217 → langroid-0.1.219}/langroid/language_models/prompt_formatter/llama2_formatter.py +0 -0
- {langroid-0.1.217 → langroid-0.1.219}/langroid/language_models/utils.py +0 -0
- {langroid-0.1.217 → langroid-0.1.219}/langroid/mytypes.py +0 -0
- {langroid-0.1.217 → langroid-0.1.219}/langroid/parsing/__init__.py +0 -0
- {langroid-0.1.217 → langroid-0.1.219}/langroid/parsing/agent_chats.py +0 -0
- {langroid-0.1.217 → langroid-0.1.219}/langroid/parsing/code-parsing.md +0 -0
- {langroid-0.1.217 → langroid-0.1.219}/langroid/parsing/code_parser.py +0 -0
- {langroid-0.1.217 → langroid-0.1.219}/langroid/parsing/config.py +0 -0
- {langroid-0.1.217 → langroid-0.1.219}/langroid/parsing/image_text.py +0 -0
- {langroid-0.1.217 → langroid-0.1.219}/langroid/parsing/para_sentence_split.py +0 -0
- {langroid-0.1.217 → langroid-0.1.219}/langroid/parsing/parser.py +0 -0
- {langroid-0.1.217 → langroid-0.1.219}/langroid/parsing/search.py +0 -0
- {langroid-0.1.217 → langroid-0.1.219}/langroid/parsing/spider.py +0 -0
- {langroid-0.1.217 → langroid-0.1.219}/langroid/parsing/table_loader.py +0 -0
- {langroid-0.1.217 → langroid-0.1.219}/langroid/parsing/url_loader.py +0 -0
- {langroid-0.1.217 → langroid-0.1.219}/langroid/parsing/url_loader_cookies.py +0 -0
- {langroid-0.1.217 → langroid-0.1.219}/langroid/parsing/web_search.py +0 -0
- {langroid-0.1.217 → langroid-0.1.219}/langroid/prompts/__init__.py +0 -0
- {langroid-0.1.217 → langroid-0.1.219}/langroid/prompts/chat-gpt4-system-prompt.md +0 -0
- {langroid-0.1.217 → langroid-0.1.219}/langroid/prompts/dialog.py +0 -0
- {langroid-0.1.217 → langroid-0.1.219}/langroid/prompts/prompts_config.py +0 -0
- {langroid-0.1.217 → langroid-0.1.219}/langroid/prompts/templates.py +0 -0
- {langroid-0.1.217 → langroid-0.1.219}/langroid/prompts/transforms.py +0 -0
- {langroid-0.1.217 → langroid-0.1.219}/langroid/utils/__init__.py +0 -0
- {langroid-0.1.217 → langroid-0.1.219}/langroid/utils/algorithms/__init__.py +0 -0
- {langroid-0.1.217 → langroid-0.1.219}/langroid/utils/algorithms/graph.py +0 -0
- {langroid-0.1.217 → langroid-0.1.219}/langroid/utils/configuration.py +0 -0
- {langroid-0.1.217 → langroid-0.1.219}/langroid/utils/constants.py +0 -0
- {langroid-0.1.217 → langroid-0.1.219}/langroid/utils/docker.py +0 -0
- {langroid-0.1.217 → langroid-0.1.219}/langroid/utils/globals.py +0 -0
- {langroid-0.1.217 → langroid-0.1.219}/langroid/utils/llms/__init__.py +0 -0
- {langroid-0.1.217 → langroid-0.1.219}/langroid/utils/llms/strings.py +0 -0
- {langroid-0.1.217 → langroid-0.1.219}/langroid/utils/logging.py +0 -0
- {langroid-0.1.217 → langroid-0.1.219}/langroid/utils/output/__init__.py +0 -0
- {langroid-0.1.217 → langroid-0.1.219}/langroid/utils/output/printing.py +0 -0
- {langroid-0.1.217 → langroid-0.1.219}/langroid/utils/output/status.py +0 -0
- {langroid-0.1.217 → langroid-0.1.219}/langroid/utils/pandas_utils.py +0 -0
- {langroid-0.1.217 → langroid-0.1.219}/langroid/utils/pydantic_utils.py +0 -0
- {langroid-0.1.217 → langroid-0.1.219}/langroid/utils/web/__init__.py +0 -0
- {langroid-0.1.217 → langroid-0.1.219}/langroid/utils/web/login.py +0 -0
- {langroid-0.1.217 → langroid-0.1.219}/langroid/vector_store/__init__.py +0 -0
- {langroid-0.1.217 → langroid-0.1.219}/langroid/vector_store/base.py +0 -0
- {langroid-0.1.217 → langroid-0.1.219}/langroid/vector_store/chromadb.py +0 -0
- {langroid-0.1.217 → langroid-0.1.219}/langroid/vector_store/lancedb.py +0 -0
- {langroid-0.1.217 → langroid-0.1.219}/langroid/vector_store/meilisearch.py +0 -0
- {langroid-0.1.217 → langroid-0.1.219}/langroid/vector_store/momento.py +0 -0
- {langroid-0.1.217 → langroid-0.1.219}/langroid/vector_store/qdrant_cloud.py +0 -0
- {langroid-0.1.217 → langroid-0.1.219}/langroid/vector_store/qdrantdb.py +0 -0
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.1
|
2
2
|
Name: langroid
|
3
|
-
Version: 0.1.
|
3
|
+
Version: 0.1.219
|
4
4
|
Summary: Harness LLMs with Multi-Agent Programming
|
5
5
|
License: MIT
|
6
6
|
Author: Prasad Chalasani
|
@@ -85,7 +85,7 @@ Requires-Dist: pytest-redis (>=3.0.2,<4.0.0)
|
|
85
85
|
Requires-Dist: python-docx (>=1.1.0,<2.0.0)
|
86
86
|
Requires-Dist: python-dotenv (>=1.0.0,<2.0.0)
|
87
87
|
Requires-Dist: python-socketio (>=5.11.0,<6.0.0) ; extra == "chainlit"
|
88
|
-
Requires-Dist: qdrant-client (>=1.
|
88
|
+
Requires-Dist: qdrant-client (>=1.8.0,<2.0.0)
|
89
89
|
Requires-Dist: rank-bm25 (>=0.2.2,<0.3.0)
|
90
90
|
Requires-Dist: redis (>=5.0.1,<6.0.0)
|
91
91
|
Requires-Dist: requests (>=2.31.0,<3.0.0)
|
@@ -101,6 +101,7 @@ Requires-Dist: tiktoken (>=0.5.1,<0.6.0)
|
|
101
101
|
Requires-Dist: torch (==2.0.0) ; extra == "hf-embeddings"
|
102
102
|
Requires-Dist: trafilatura (>=1.5.0,<2.0.0)
|
103
103
|
Requires-Dist: typer (>=0.9.0,<0.10.0)
|
104
|
+
Requires-Dist: types-pyyaml (>=6.0.12.20240311,<7.0.0.0)
|
104
105
|
Requires-Dist: types-redis (>=4.5.5.2,<5.0.0.0)
|
105
106
|
Requires-Dist: types-requests (>=2.31.0.1,<3.0.0.0)
|
106
107
|
Requires-Dist: unstructured[docx,pdf,pptx] (>=0.10.16,<0.10.18) ; extra == "unstructured"
|
@@ -35,6 +35,7 @@ from langroid.embedding_models.models import OpenAIEmbeddingsConfig
|
|
35
35
|
from langroid.language_models.base import StreamingIfAllowed
|
36
36
|
from langroid.language_models.openai_gpt import OpenAIChatModel, OpenAIGPTConfig
|
37
37
|
from langroid.mytypes import DocMetaData, Document, Entity
|
38
|
+
from langroid.parsing.document_parser import DocumentType
|
38
39
|
from langroid.parsing.parser import Parser, ParsingConfig, PdfParsingConfig, Splitter
|
39
40
|
from langroid.parsing.repo_loader import RepoLoader
|
40
41
|
from langroid.parsing.search import (
|
@@ -44,7 +45,7 @@ from langroid.parsing.search import (
|
|
44
45
|
)
|
45
46
|
from langroid.parsing.table_loader import describe_dataframe
|
46
47
|
from langroid.parsing.url_loader import URLLoader
|
47
|
-
from langroid.parsing.urls import get_list_from_user,
|
48
|
+
from langroid.parsing.urls import get_list_from_user, get_urls_paths_bytes_indices
|
48
49
|
from langroid.parsing.utils import batched
|
49
50
|
from langroid.prompts.prompts_config import PromptsConfig
|
50
51
|
from langroid.prompts.templates import SUMMARY_ANSWER_PROMPT_GPT4
|
@@ -126,7 +127,7 @@ class DocChatAgentConfig(ChatAgentConfig):
|
|
126
127
|
llm=None # use the parent's llm unless explicitly set here
|
127
128
|
)
|
128
129
|
)
|
129
|
-
doc_paths: List[str] = []
|
130
|
+
doc_paths: List[str | bytes] = []
|
130
131
|
default_paths: List[str] = [
|
131
132
|
"https://news.ycombinator.com/item?id=35629033",
|
132
133
|
"https://www.newyorker.com/tech/annals-of-technology/chatgpt-is-a-blurry-jpeg-of-the-web",
|
@@ -248,62 +249,84 @@ class DocChatAgent(ChatAgent):
|
|
248
249
|
raise ValueError("VecDB not set")
|
249
250
|
self.setup_documents(filter=self.config.filter)
|
250
251
|
return
|
251
|
-
self.ingest_doc_paths(self.config.doc_paths)
|
252
|
+
self.ingest_doc_paths(self.config.doc_paths) # type: ignore
|
252
253
|
|
253
254
|
def ingest_doc_paths(
|
254
255
|
self,
|
255
|
-
paths: List[str],
|
256
|
+
paths: str | bytes | List[str | bytes],
|
256
257
|
metadata: (
|
257
258
|
List[Dict[str, Any]] | Dict[str, Any] | DocMetaData | List[DocMetaData]
|
258
259
|
) = [],
|
260
|
+
doc_type: str | DocumentType | None = None,
|
259
261
|
) -> List[Document]:
|
260
262
|
"""Split, ingest docs from specified paths,
|
261
263
|
do not add these to config.doc_paths.
|
262
264
|
|
263
265
|
Args:
|
264
|
-
paths:
|
266
|
+
paths: document paths, urls or byte-content of docs.
|
267
|
+
The bytes option is intended to support cases where a document
|
268
|
+
has already been read in as bytes (e.g. from an API or a database),
|
269
|
+
and we want to avoid having to write it to a temporary file
|
270
|
+
just to read it back in.
|
265
271
|
metadata: List of metadata dicts, one for each path.
|
266
272
|
If a single dict is passed in, it is used for all paths.
|
273
|
+
doc_type: DocumentType to use for parsing, if known.
|
274
|
+
MUST apply to all docs if specified.
|
275
|
+
This is especially useful when the `paths` are of bytes type,
|
276
|
+
to help with document type detection.
|
267
277
|
Returns:
|
268
278
|
List of Document objects
|
269
279
|
"""
|
280
|
+
if isinstance(paths, str) or isinstance(paths, bytes):
|
281
|
+
paths = [paths]
|
270
282
|
all_paths = paths
|
271
|
-
paths_meta: Dict[
|
272
|
-
urls_meta: Dict[
|
273
|
-
|
283
|
+
paths_meta: Dict[int, Any] = {}
|
284
|
+
urls_meta: Dict[int, Any] = {}
|
285
|
+
idxs = range(len(all_paths))
|
286
|
+
url_idxs, path_idxs, bytes_idxs = get_urls_paths_bytes_indices(all_paths)
|
287
|
+
urls = [all_paths[i] for i in url_idxs]
|
288
|
+
paths = [all_paths[i] for i in path_idxs]
|
289
|
+
bytes_list = [all_paths[i] for i in bytes_idxs]
|
290
|
+
path_idxs.extend(bytes_idxs)
|
291
|
+
paths.extend(bytes_list)
|
274
292
|
if (isinstance(metadata, list) and len(metadata) > 0) or not isinstance(
|
275
293
|
metadata, list
|
276
294
|
):
|
277
295
|
if isinstance(metadata, list):
|
278
|
-
|
296
|
+
idx2meta = {
|
279
297
|
p: (
|
280
298
|
m
|
281
299
|
if isinstance(m, dict)
|
282
300
|
else (isinstance(m, DocMetaData) and m.dict())
|
283
301
|
) # appease mypy
|
284
|
-
for p, m in zip(
|
302
|
+
for p, m in zip(idxs, metadata)
|
285
303
|
}
|
286
304
|
elif isinstance(metadata, dict):
|
287
|
-
|
305
|
+
idx2meta = {p: metadata for p in idxs}
|
288
306
|
else:
|
289
|
-
|
290
|
-
urls_meta = {u:
|
291
|
-
paths_meta = {p:
|
307
|
+
idx2meta = {p: metadata.dict() for p in idxs}
|
308
|
+
urls_meta = {u: idx2meta[u] for u in url_idxs}
|
309
|
+
paths_meta = {p: idx2meta[p] for p in path_idxs}
|
292
310
|
docs: List[Document] = []
|
293
311
|
parser = Parser(self.config.parsing)
|
294
312
|
if len(urls) > 0:
|
295
|
-
for
|
296
|
-
meta = urls_meta.get(
|
297
|
-
loader = URLLoader(urls=[
|
313
|
+
for ui in url_idxs:
|
314
|
+
meta = urls_meta.get(ui, {})
|
315
|
+
loader = URLLoader(urls=[all_paths[ui]], parser=parser) # type: ignore
|
298
316
|
url_docs = loader.load()
|
299
317
|
# update metadata of each doc with meta
|
300
318
|
for d in url_docs:
|
301
319
|
d.metadata = d.metadata.copy(update=meta)
|
302
320
|
docs.extend(url_docs)
|
303
|
-
if len(paths) > 0:
|
304
|
-
for
|
305
|
-
meta = paths_meta.get(
|
306
|
-
|
321
|
+
if len(paths) > 0: # paths OR bytes are handled similarly
|
322
|
+
for pi in path_idxs:
|
323
|
+
meta = paths_meta.get(pi, {})
|
324
|
+
p = all_paths[pi]
|
325
|
+
path_docs = RepoLoader.get_documents(
|
326
|
+
p,
|
327
|
+
parser=parser,
|
328
|
+
doc_type=doc_type,
|
329
|
+
)
|
307
330
|
# update metadata of each doc with meta
|
308
331
|
for d in path_docs:
|
309
332
|
d.metadata = d.metadata.copy(update=meta)
|
@@ -317,11 +340,12 @@ class DocChatAgent(ChatAgent):
|
|
317
340
|
print(
|
318
341
|
f"""
|
319
342
|
[green]I have processed the following {n_urls} URLs
|
320
|
-
and {n_paths}
|
343
|
+
and {n_paths} docs into {n_splits} parts:
|
321
344
|
""".strip()
|
322
345
|
)
|
323
|
-
|
324
|
-
print("\n".join(
|
346
|
+
path_reps = [p if isinstance(p, str) else "bytes" for p in paths]
|
347
|
+
print("\n".join([u for u in urls if isinstance(u, str)])) # appease mypy
|
348
|
+
print("\n".join(path_reps))
|
325
349
|
return docs
|
326
350
|
|
327
351
|
def ingest_docs(
|
@@ -388,6 +412,7 @@ class DocChatAgent(ChatAgent):
|
|
388
412
|
+ ",content="
|
389
413
|
+ d.content
|
390
414
|
)
|
415
|
+
docs = docs[: self.config.parsing.max_chunks]
|
391
416
|
# add embeddings in batches, to stay under limit of embeddings API
|
392
417
|
batches = list(batched(docs, self.config.embed_batch_size))
|
393
418
|
for batch in batches:
|
@@ -463,6 +488,10 @@ class DocChatAgent(ChatAgent):
|
|
463
488
|
d.metadata.is_chunk = True
|
464
489
|
return self.ingest_docs(docs)
|
465
490
|
|
491
|
+
def set_filter(self, filter: str) -> None:
|
492
|
+
self.config.filter = filter
|
493
|
+
self.setup_documents(filter=filter)
|
494
|
+
|
466
495
|
def setup_documents(
|
467
496
|
self,
|
468
497
|
docs: List[Document] = [],
|
@@ -609,7 +638,7 @@ class DocChatAgent(ChatAgent):
|
|
609
638
|
if len(inputs) == 0:
|
610
639
|
if is_new_collection:
|
611
640
|
inputs = self.config.default_paths
|
612
|
-
self.config.doc_paths = inputs
|
641
|
+
self.config.doc_paths = inputs # type: ignore
|
613
642
|
self.ingest()
|
614
643
|
|
615
644
|
def llm_response(
|
@@ -1,3 +1,4 @@
|
|
1
|
+
import itertools
|
1
2
|
import logging
|
2
3
|
import re
|
3
4
|
from enum import Enum
|
@@ -8,6 +9,7 @@ import fitz
|
|
8
9
|
import pdfplumber
|
9
10
|
import pypdf
|
10
11
|
import requests
|
12
|
+
from bs4 import BeautifulSoup
|
11
13
|
from PIL import Image
|
12
14
|
|
13
15
|
from langroid.mytypes import DocMetaData, Document
|
@@ -20,6 +22,29 @@ class DocumentType(str, Enum):
|
|
20
22
|
PDF = "pdf"
|
21
23
|
DOCX = "docx"
|
22
24
|
DOC = "doc"
|
25
|
+
TXT = "txt"
|
26
|
+
|
27
|
+
|
28
|
+
def is_plain_text(path_or_bytes: str | bytes) -> bool:
|
29
|
+
if isinstance(path_or_bytes, str):
|
30
|
+
if path_or_bytes.startswith(("http://", "https://")):
|
31
|
+
response = requests.get(path_or_bytes)
|
32
|
+
response.raise_for_status()
|
33
|
+
content = response.content[:1024]
|
34
|
+
else:
|
35
|
+
with open(path_or_bytes, "rb") as f:
|
36
|
+
content = f.read(1024)
|
37
|
+
else:
|
38
|
+
content = path_or_bytes[:1024]
|
39
|
+
try:
|
40
|
+
# Attempt to decode the content as UTF-8
|
41
|
+
_ = content.decode("utf-8")
|
42
|
+
# Additional checks can go here, e.g., to verify that the content
|
43
|
+
# doesn't contain too many unusual characters for it to be considered text
|
44
|
+
return True
|
45
|
+
except UnicodeDecodeError:
|
46
|
+
# If decoding fails, it's likely not plain text (or not encoded in UTF-8)
|
47
|
+
return False
|
23
48
|
|
24
49
|
|
25
50
|
class DocumentParser(Parser):
|
@@ -33,19 +58,26 @@ class DocumentParser(Parser):
|
|
33
58
|
"""
|
34
59
|
|
35
60
|
@classmethod
|
36
|
-
def create(
|
61
|
+
def create(
|
62
|
+
cls,
|
63
|
+
source: str | bytes,
|
64
|
+
config: ParsingConfig,
|
65
|
+
doc_type: str | DocumentType | None = None,
|
66
|
+
) -> "DocumentParser":
|
37
67
|
"""
|
38
68
|
Create a DocumentParser instance based on source type
|
39
69
|
and config.<source_type>.library specified.
|
40
70
|
|
41
71
|
Args:
|
42
|
-
source (str): The source
|
72
|
+
source (str|bytes): The source, could be a URL, file path,
|
73
|
+
or bytes object.
|
43
74
|
config (ParserConfig): The parser configuration.
|
75
|
+
doc_type (str|None): The type of document, if known
|
44
76
|
|
45
77
|
Returns:
|
46
78
|
DocumentParser: An instance of a DocumentParser subclass.
|
47
79
|
"""
|
48
|
-
if DocumentParser._document_type(source) == DocumentType.PDF:
|
80
|
+
if DocumentParser._document_type(source, doc_type) == DocumentType.PDF:
|
49
81
|
if config.pdf.library == "fitz":
|
50
82
|
return FitzPDFParser(source, config)
|
51
83
|
elif config.pdf.library == "pypdf":
|
@@ -60,7 +92,7 @@ class DocumentParser(Parser):
|
|
60
92
|
raise ValueError(
|
61
93
|
f"Unsupported PDF library specified: {config.pdf.library}"
|
62
94
|
)
|
63
|
-
elif DocumentParser._document_type(source) == DocumentType.DOCX:
|
95
|
+
elif DocumentParser._document_type(source, doc_type) == DocumentType.DOCX:
|
64
96
|
if config.docx.library == "unstructured":
|
65
97
|
return UnstructuredDocxParser(source, config)
|
66
98
|
elif config.docx.library == "python-docx":
|
@@ -69,42 +101,78 @@ class DocumentParser(Parser):
|
|
69
101
|
raise ValueError(
|
70
102
|
f"Unsupported DOCX library specified: {config.docx.library}"
|
71
103
|
)
|
72
|
-
elif DocumentParser._document_type(source) == DocumentType.DOC:
|
104
|
+
elif DocumentParser._document_type(source, doc_type) == DocumentType.DOC:
|
73
105
|
return UnstructuredDocParser(source, config)
|
74
106
|
else:
|
75
|
-
|
107
|
+
source_name = source if isinstance(source, str) else "bytes"
|
108
|
+
raise ValueError(f"Unsupported document type: {source_name}")
|
76
109
|
|
77
|
-
def __init__(self, source: str, config: ParsingConfig):
|
110
|
+
def __init__(self, source: str | bytes, config: ParsingConfig):
|
78
111
|
"""
|
79
|
-
Initialize the PDFParser.
|
80
|
-
|
81
112
|
Args:
|
82
|
-
source (str): The source
|
113
|
+
source (str|bytes): The source, which could be
|
114
|
+
a path, a URL or a bytes object.
|
83
115
|
"""
|
84
116
|
super().__init__(config)
|
85
|
-
self.source = source
|
86
117
|
self.config = config
|
87
|
-
|
118
|
+
if isinstance(source, bytes):
|
119
|
+
self.source = "bytes"
|
120
|
+
self.doc_bytes = BytesIO(source)
|
121
|
+
else:
|
122
|
+
self.source = source
|
123
|
+
self.doc_bytes = self._load_doc_as_bytesio()
|
88
124
|
|
89
125
|
@staticmethod
|
90
|
-
def _document_type(
|
126
|
+
def _document_type(
|
127
|
+
source: str | bytes, doc_type: str | DocumentType | None = None
|
128
|
+
) -> DocumentType:
|
91
129
|
"""
|
92
130
|
Determine the type of document based on the source.
|
93
131
|
|
94
132
|
Args:
|
95
|
-
source (str): The source
|
133
|
+
source (str|bytes): The source, which could be a URL,
|
134
|
+
a file path, or a bytes object.
|
135
|
+
doc_type (str|DocumentType|None): The type of document, if known.
|
96
136
|
|
97
137
|
Returns:
|
98
138
|
str: The document type.
|
99
139
|
"""
|
100
|
-
if
|
101
|
-
return
|
102
|
-
|
103
|
-
return DocumentType.
|
104
|
-
|
105
|
-
return DocumentType.
|
140
|
+
if isinstance(doc_type, DocumentType):
|
141
|
+
return doc_type
|
142
|
+
if doc_type:
|
143
|
+
return DocumentType(doc_type.lower())
|
144
|
+
if is_plain_text(source):
|
145
|
+
return DocumentType.TXT
|
146
|
+
if isinstance(source, str):
|
147
|
+
# detect file type from path extension
|
148
|
+
if source.lower().endswith(".pdf"):
|
149
|
+
return DocumentType.PDF
|
150
|
+
elif source.lower().endswith(".docx"):
|
151
|
+
return DocumentType.DOCX
|
152
|
+
elif source.lower().endswith(".doc"):
|
153
|
+
return DocumentType.DOC
|
154
|
+
else:
|
155
|
+
raise ValueError(f"Unsupported document type: {source}")
|
106
156
|
else:
|
107
|
-
|
157
|
+
# must be bytes: attempt to detect type from content
|
158
|
+
# using magic mime type detection
|
159
|
+
import magic
|
160
|
+
|
161
|
+
mime_type = magic.from_buffer(source, mime=True)
|
162
|
+
if mime_type == "application/pdf":
|
163
|
+
return DocumentType.PDF
|
164
|
+
elif mime_type in [
|
165
|
+
"application/vnd.openxmlformats-officedocument"
|
166
|
+
".wordprocessingml.document",
|
167
|
+
"application/zip",
|
168
|
+
]:
|
169
|
+
# DOCX files are essentially ZIP files,
|
170
|
+
# but this might catch other ZIP-based formats too!
|
171
|
+
return DocumentType.DOCX
|
172
|
+
elif mime_type == "application/msword":
|
173
|
+
return DocumentType.DOC
|
174
|
+
else:
|
175
|
+
raise ValueError("Unsupported document type from bytes")
|
108
176
|
|
109
177
|
def _load_doc_as_bytesio(self) -> BytesIO:
|
110
178
|
"""
|
@@ -121,6 +189,61 @@ class DocumentParser(Parser):
|
|
121
189
|
with open(self.source, "rb") as f:
|
122
190
|
return BytesIO(f.read())
|
123
191
|
|
192
|
+
@staticmethod
|
193
|
+
def chunks_from_path_or_bytes(
|
194
|
+
source: str | bytes,
|
195
|
+
parser: Parser,
|
196
|
+
doc_type: str | DocumentType | None = None,
|
197
|
+
lines: int | None = None,
|
198
|
+
) -> List[Document]:
|
199
|
+
"""
|
200
|
+
Get document chunks from a file path or bytes object.
|
201
|
+
Args:
|
202
|
+
source (str|bytes): The source, which could be a URL, path or bytes object.
|
203
|
+
parser (Parser): The parser instance (for splitting the document).
|
204
|
+
doc_type (str|DocumentType|None): The type of document, if known.
|
205
|
+
lines (int|None): The number of lines to read from a plain text file.
|
206
|
+
Returns:
|
207
|
+
List[Document]: A list of `Document` objects,
|
208
|
+
each containing a chunk of text, determined by the
|
209
|
+
chunking and splitting settings in the parser config.
|
210
|
+
"""
|
211
|
+
dtype: DocumentType = DocumentParser._document_type(source, doc_type)
|
212
|
+
if dtype in [DocumentType.PDF, DocumentType.DOC, DocumentType.DOCX]:
|
213
|
+
doc_parser = DocumentParser.create(
|
214
|
+
source,
|
215
|
+
parser.config,
|
216
|
+
doc_type=doc_type,
|
217
|
+
)
|
218
|
+
chunks = doc_parser.get_doc_chunks()
|
219
|
+
if len(chunks) == 0 and dtype == DocumentType.PDF:
|
220
|
+
doc_parser = ImagePdfParser(source, parser.config)
|
221
|
+
chunks = doc_parser.get_doc_chunks()
|
222
|
+
return chunks
|
223
|
+
else:
|
224
|
+
# try getting as plain text; these will be chunked downstream
|
225
|
+
# -- could be a bytes object or a path
|
226
|
+
if isinstance(source, bytes):
|
227
|
+
content = source.decode()
|
228
|
+
if lines is not None:
|
229
|
+
file_lines = content.splitlines()[:lines]
|
230
|
+
content = "\n".join(line.strip() for line in file_lines)
|
231
|
+
else:
|
232
|
+
with open(source, "r") as f:
|
233
|
+
if lines is not None:
|
234
|
+
file_lines = list(itertools.islice(f, lines))
|
235
|
+
content = "\n".join(line.strip() for line in file_lines)
|
236
|
+
else:
|
237
|
+
content = f.read()
|
238
|
+
soup = BeautifulSoup(content, "html.parser")
|
239
|
+
text = soup.get_text()
|
240
|
+
source_name = source if isinstance(source, str) else "bytes"
|
241
|
+
doc = Document(
|
242
|
+
content=text,
|
243
|
+
metadata=DocMetaData(source=str(source_name)),
|
244
|
+
)
|
245
|
+
return parser.split([doc])
|
246
|
+
|
124
247
|
def iterate_pages(self) -> Generator[Tuple[int, Any], None, None]:
|
125
248
|
"""Yield each page in the PDF."""
|
126
249
|
raise NotImplementedError
|
@@ -145,7 +268,7 @@ class DocumentParser(Parser):
|
|
145
268
|
|
146
269
|
def get_doc(self) -> Document:
|
147
270
|
"""
|
148
|
-
Get entire text from
|
271
|
+
Get entire text from source as a single document.
|
149
272
|
|
150
273
|
Returns:
|
151
274
|
a `Document` object containing the content of the pdf file,
|
@@ -1,7 +1,7 @@
|
|
1
1
|
import json
|
2
|
-
import re
|
3
2
|
from typing import Any, Iterator, List
|
4
3
|
|
4
|
+
import yaml
|
5
5
|
from pyparsing import nestedExpr, originalTextFor
|
6
6
|
|
7
7
|
|
@@ -45,37 +45,31 @@ def get_json_candidates(s: str) -> List[str]:
|
|
45
45
|
return []
|
46
46
|
|
47
47
|
|
48
|
-
def
|
48
|
+
def add_quotes(s: str) -> str:
|
49
49
|
"""
|
50
|
-
Replace
|
50
|
+
Replace accidentally un-quoted string-like keys and values in a potential json str.
|
51
|
+
Intended to handle cases where a weak LLM may produce a JSON-like string
|
52
|
+
containing, e.g. "rent": DO-NOT-KNOW, where it "forgot" to put quotes on the value,
|
53
|
+
or city: "New York" where it "forgot" to put quotes on the key.
|
54
|
+
It will even handle cases like 'address: do not know'.
|
55
|
+
|
56
|
+
Got this fiendishly clever solution from
|
57
|
+
https://stackoverflow.com/a/66053900/10940584
|
58
|
+
Far better/safer than trying to do it with regexes.
|
51
59
|
|
52
60
|
Args:
|
53
61
|
- s (str): The potential JSON string to parse.
|
54
|
-
- undefined_placeholder (str): The placeholder or error message
|
55
|
-
for undefined values.
|
56
62
|
|
57
63
|
Returns:
|
58
|
-
- str: The (potential) JSON string with
|
59
|
-
replaced by
|
64
|
+
- str: The (potential) JSON string with un-quoted string-like values
|
65
|
+
replaced by quoted values.
|
60
66
|
"""
|
61
|
-
|
62
|
-
|
63
|
-
# This regex looks for patterns like ": <identifier>" and replaces them
|
64
|
-
# with the placeholder.
|
65
|
-
# It's a simple approach and might need adjustments for complex cases
|
66
|
-
# This is an attempt to handle cases where a weak LLM may produce
|
67
|
-
# a JSON-like string without quotes around some values, e.g.
|
68
|
-
# {"rent": DO-NOT-KNOW }
|
69
|
-
preprocessed_s = re.sub(
|
70
|
-
r":\s*([a-zA-Z_][a-zA-Z_0-9\-]*)", f": {undefined_placeholder}", s
|
71
|
-
)
|
72
|
-
|
73
|
-
# Now, attempt to parse the preprocessed string as JSON
|
67
|
+
if is_valid_json(s):
|
68
|
+
return s
|
74
69
|
try:
|
75
|
-
|
70
|
+
dct = yaml.load(s, yaml.SafeLoader)
|
71
|
+
return json.dumps(dct)
|
76
72
|
except Exception:
|
77
|
-
# If parsing fails, return an error message instead
|
78
|
-
# (this should be rare after preprocessing)
|
79
73
|
return s
|
80
74
|
|
81
75
|
|
@@ -115,7 +109,7 @@ def extract_top_level_json(s: str) -> List[str]:
|
|
115
109
|
candidate.replace("\\{", "{").replace("\\}", "}").replace("\\_", "_")
|
116
110
|
for candidate in json_candidates
|
117
111
|
]
|
118
|
-
candidates = [
|
112
|
+
candidates = [add_quotes(candidate) for candidate in normalized_candidates]
|
119
113
|
candidates = [repair_newlines(candidate) for candidate in candidates]
|
120
114
|
top_level_jsons = [
|
121
115
|
candidate for candidate in candidates if is_valid_json(candidate)
|