langroid 0.33.6__py3-none-any.whl → 0.33.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- langroid/__init__.py +106 -0
- langroid/agent/__init__.py +41 -0
- langroid/agent/base.py +1983 -0
- langroid/agent/batch.py +398 -0
- langroid/agent/callbacks/__init__.py +0 -0
- langroid/agent/callbacks/chainlit.py +598 -0
- langroid/agent/chat_agent.py +1899 -0
- langroid/agent/chat_document.py +454 -0
- langroid/agent/openai_assistant.py +882 -0
- langroid/agent/special/__init__.py +59 -0
- langroid/agent/special/arangodb/__init__.py +0 -0
- langroid/agent/special/arangodb/arangodb_agent.py +656 -0
- langroid/agent/special/arangodb/system_messages.py +186 -0
- langroid/agent/special/arangodb/tools.py +107 -0
- langroid/agent/special/arangodb/utils.py +36 -0
- langroid/agent/special/doc_chat_agent.py +1466 -0
- langroid/agent/special/lance_doc_chat_agent.py +262 -0
- langroid/agent/special/lance_rag/__init__.py +9 -0
- langroid/agent/special/lance_rag/critic_agent.py +198 -0
- langroid/agent/special/lance_rag/lance_rag_task.py +82 -0
- langroid/agent/special/lance_rag/query_planner_agent.py +260 -0
- langroid/agent/special/lance_tools.py +61 -0
- langroid/agent/special/neo4j/__init__.py +0 -0
- langroid/agent/special/neo4j/csv_kg_chat.py +174 -0
- langroid/agent/special/neo4j/neo4j_chat_agent.py +433 -0
- langroid/agent/special/neo4j/system_messages.py +120 -0
- langroid/agent/special/neo4j/tools.py +32 -0
- langroid/agent/special/relevance_extractor_agent.py +127 -0
- langroid/agent/special/retriever_agent.py +56 -0
- langroid/agent/special/sql/__init__.py +17 -0
- langroid/agent/special/sql/sql_chat_agent.py +654 -0
- langroid/agent/special/sql/utils/__init__.py +21 -0
- langroid/agent/special/sql/utils/description_extractors.py +190 -0
- langroid/agent/special/sql/utils/populate_metadata.py +85 -0
- langroid/agent/special/sql/utils/system_message.py +35 -0
- langroid/agent/special/sql/utils/tools.py +64 -0
- langroid/agent/special/table_chat_agent.py +263 -0
- langroid/agent/task.py +2095 -0
- langroid/agent/tool_message.py +393 -0
- langroid/agent/tools/__init__.py +38 -0
- langroid/agent/tools/duckduckgo_search_tool.py +50 -0
- langroid/agent/tools/file_tools.py +234 -0
- langroid/agent/tools/google_search_tool.py +39 -0
- langroid/agent/tools/metaphor_search_tool.py +68 -0
- langroid/agent/tools/orchestration.py +303 -0
- langroid/agent/tools/recipient_tool.py +235 -0
- langroid/agent/tools/retrieval_tool.py +32 -0
- langroid/agent/tools/rewind_tool.py +137 -0
- langroid/agent/tools/segment_extract_tool.py +41 -0
- langroid/agent/xml_tool_message.py +382 -0
- langroid/cachedb/__init__.py +17 -0
- langroid/cachedb/base.py +58 -0
- langroid/cachedb/momento_cachedb.py +108 -0
- langroid/cachedb/redis_cachedb.py +153 -0
- langroid/embedding_models/__init__.py +39 -0
- langroid/embedding_models/base.py +74 -0
- langroid/embedding_models/models.py +461 -0
- langroid/embedding_models/protoc/__init__.py +0 -0
- langroid/embedding_models/protoc/embeddings.proto +19 -0
- langroid/embedding_models/protoc/embeddings_pb2.py +33 -0
- langroid/embedding_models/protoc/embeddings_pb2.pyi +50 -0
- langroid/embedding_models/protoc/embeddings_pb2_grpc.py +79 -0
- langroid/embedding_models/remote_embeds.py +153 -0
- langroid/exceptions.py +71 -0
- langroid/language_models/__init__.py +53 -0
- langroid/language_models/azure_openai.py +153 -0
- langroid/language_models/base.py +678 -0
- langroid/language_models/config.py +18 -0
- langroid/language_models/mock_lm.py +124 -0
- langroid/language_models/openai_gpt.py +1964 -0
- langroid/language_models/prompt_formatter/__init__.py +16 -0
- langroid/language_models/prompt_formatter/base.py +40 -0
- langroid/language_models/prompt_formatter/hf_formatter.py +132 -0
- langroid/language_models/prompt_formatter/llama2_formatter.py +75 -0
- langroid/language_models/utils.py +151 -0
- langroid/mytypes.py +84 -0
- langroid/parsing/__init__.py +52 -0
- langroid/parsing/agent_chats.py +38 -0
- langroid/parsing/code_parser.py +121 -0
- langroid/parsing/document_parser.py +718 -0
- langroid/parsing/para_sentence_split.py +62 -0
- langroid/parsing/parse_json.py +155 -0
- langroid/parsing/parser.py +313 -0
- langroid/parsing/repo_loader.py +790 -0
- langroid/parsing/routing.py +36 -0
- langroid/parsing/search.py +275 -0
- langroid/parsing/spider.py +102 -0
- langroid/parsing/table_loader.py +94 -0
- langroid/parsing/url_loader.py +111 -0
- langroid/parsing/urls.py +273 -0
- langroid/parsing/utils.py +373 -0
- langroid/parsing/web_search.py +156 -0
- langroid/prompts/__init__.py +9 -0
- langroid/prompts/dialog.py +17 -0
- langroid/prompts/prompts_config.py +5 -0
- langroid/prompts/templates.py +141 -0
- langroid/pydantic_v1/__init__.py +10 -0
- langroid/pydantic_v1/main.py +4 -0
- langroid/utils/__init__.py +19 -0
- langroid/utils/algorithms/__init__.py +3 -0
- langroid/utils/algorithms/graph.py +103 -0
- langroid/utils/configuration.py +98 -0
- langroid/utils/constants.py +30 -0
- langroid/utils/git_utils.py +252 -0
- langroid/utils/globals.py +49 -0
- langroid/utils/logging.py +135 -0
- langroid/utils/object_registry.py +66 -0
- langroid/utils/output/__init__.py +20 -0
- langroid/utils/output/citations.py +41 -0
- langroid/utils/output/printing.py +99 -0
- langroid/utils/output/status.py +40 -0
- langroid/utils/pandas_utils.py +30 -0
- langroid/utils/pydantic_utils.py +602 -0
- langroid/utils/system.py +286 -0
- langroid/utils/types.py +93 -0
- langroid/vector_store/__init__.py +50 -0
- langroid/vector_store/base.py +359 -0
- langroid/vector_store/chromadb.py +214 -0
- langroid/vector_store/lancedb.py +406 -0
- langroid/vector_store/meilisearch.py +299 -0
- langroid/vector_store/momento.py +278 -0
- langroid/vector_store/qdrantdb.py +468 -0
- {langroid-0.33.6.dist-info → langroid-0.33.7.dist-info}/METADATA +95 -94
- langroid-0.33.7.dist-info/RECORD +127 -0
- {langroid-0.33.6.dist-info → langroid-0.33.7.dist-info}/WHEEL +1 -1
- langroid-0.33.6.dist-info/RECORD +0 -7
- langroid-0.33.6.dist-info/entry_points.txt +0 -4
- pyproject.toml +0 -356
- {langroid-0.33.6.dist-info → langroid-0.33.7.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,121 @@
|
|
1
|
+
from functools import reduce
|
2
|
+
from typing import Callable, List
|
3
|
+
|
4
|
+
import tiktoken
|
5
|
+
from pygments import lex
|
6
|
+
from pygments.lexers import get_lexer_by_name
|
7
|
+
from pygments.token import Token
|
8
|
+
|
9
|
+
from langroid.mytypes import Document
|
10
|
+
from langroid.pydantic_v1 import BaseSettings
|
11
|
+
|
12
|
+
|
13
|
+
def chunk_code(
    code: str, language: str, max_tokens: int, len_fn: Callable[[str], int]
) -> List[str]:
    """
    Chunk code into smaller pieces, so that we don't exceed the maximum
    number of tokens allowed by the embedding model.

    The code is lexed with Pygments, and tokens are accumulated into chunks.
    Whitespace tokens are always appended to the current chunk (so a chunk may
    slightly exceed `max_tokens` by trailing whitespace); any other token that
    would overflow the current chunk starts a new one.

    Args:
        code: string of code
        language: str as a file extension, e.g. "py", "yml"
        max_tokens: max tokens per chunk
        len_fn: function to get the length of a string in token units

    Returns:
        List of non-empty chunks of `code`, in original order.
    """
    lexer = get_lexer_by_name(language)
    tokens = list(lex(code, lexer))

    chunks: List[str] = []
    current_chunk = ""
    for token_type, token_value in tokens:
        if token_type in Token.Text.Whitespace:
            # Whitespace never triggers a new chunk: keep it attached to the
            # code that precedes it, even if it overflows `max_tokens`.
            current_chunk += token_value
        elif len_fn(current_chunk) + len_fn(token_value) <= max_tokens:
            current_chunk += token_value
        else:
            # Fix: the original appended `current_chunk` unconditionally here,
            # which emitted empty-string chunks whenever a single token was
            # larger than `max_tokens` while the current chunk was empty.
            if current_chunk:
                chunks.append(current_chunk)
            current_chunk = token_value

    if current_chunk:
        chunks.append(current_chunk)

    return chunks
|
47
|
+
|
48
|
+
|
49
|
+
class CodeParsingConfig(BaseSettings):
    """Settings that control how source-code documents are chunked."""

    # File extensions (without the leading dot) treated as code; includes
    # config formats such as "cfg" (e.g. setup.cfg) and "ini".
    extensions: List[str] = (
        "py java c cpp h hpp yml yaml toml cfg ini json rst sh bash".split()
    )
    # Maximum size of each chunk, in tokens.
    chunk_size: int = 500
    # Model whose tokenizer is used to count tokens (via tiktoken).
    token_encoding_model: str = "text-embedding-ada-002"
    # NOTE(review): presumably the number of similar docs to retrieve at
    # query time — not used within this module; confirm against callers.
    n_similar_docs: int = 4
|
70
|
+
|
71
|
+
|
72
|
+
class CodeParser:
    """Splits code ``Document``s into token-bounded chunks via ``chunk_code``."""

    def __init__(self, config: CodeParsingConfig):
        """
        Args:
            config: chunking settings; ``token_encoding_model`` selects the
                tiktoken tokenizer used for token counting.
        """
        self.config = config
        self.tokenizer = tiktoken.encoding_for_model(config.token_encoding_model)

    def num_tokens(self, text: str) -> int:
        """
        How many tokens are in the text, according to the tokenizer.
        This needs to be accurate, otherwise we may exceed the maximum
        number of tokens allowed by the model.
        Args:
            text: string to tokenize
        Returns:
            number of tokens in the text
        """
        return len(self.tokenizer.encode(text))

    def split(self, docs: List[Document]) -> List[Document]:
        """
        Split the documents into chunks, according to the config.splitter.
        Only the documents with a language in the config.extensions are split.
        !!! note
            We assume the metadata in each document has at least a `language` field,
            which is used to determine how to chunk the code.
        Args:
            docs: list of documents to split
        Returns:
            list of documents, where each document is a chunk; the metadata of the
            original document is duplicated for each chunk, so that when we retrieve a
            chunk, we immediately know info about the original document.
        """
        # Flatten directly into one list. (The original built a list of lists
        # and collapsed it with `reduce(lambda x, y: x + y, ...)`, which is
        # quadratic in the number of documents; output order is unchanged.)
        return [
            Document(content=chunk, metadata=d.metadata)
            for d in docs
            if d.metadata.language in self.config.extensions  # type: ignore
            for chunk in chunk_code(
                d.content,
                d.metadata.language,  # type: ignore
                self.config.chunk_size,
                self.num_tokens,
            )
            if chunk.strip() != ""
        ]
|