langroid 0.1.196__tar.gz → 0.1.198__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (121)
  1. {langroid-0.1.196 → langroid-0.1.198}/PKG-INFO +6 -5
  2. {langroid-0.1.196 → langroid-0.1.198}/langroid/agent/chat_agent.py +10 -2
  3. {langroid-0.1.196 → langroid-0.1.198}/langroid/agent/chat_document.py +1 -1
  4. {langroid-0.1.196 → langroid-0.1.198}/langroid/agent/special/doc_chat_agent.py +2 -2
  5. {langroid-0.1.196 → langroid-0.1.198}/langroid/agent/task.py +7 -3
  6. {langroid-0.1.196 → langroid-0.1.198}/langroid/agent/tool_message.py +31 -6
  7. {langroid-0.1.196 → langroid-0.1.198}/langroid/embedding_models/models.py +53 -14
  8. {langroid-0.1.196 → langroid-0.1.198}/langroid/language_models/prompt_formatter/hf_formatter.py +28 -2
  9. {langroid-0.1.196 → langroid-0.1.198}/langroid/parsing/document_parser.py +34 -53
  10. {langroid-0.1.196 → langroid-0.1.198}/langroid/parsing/json.py +59 -2
  11. {langroid-0.1.196 → langroid-0.1.198}/langroid/parsing/parser.py +1 -3
  12. {langroid-0.1.196 → langroid-0.1.198}/langroid/utils/pydantic_utils.py +47 -0
  13. {langroid-0.1.196 → langroid-0.1.198}/langroid/utils/system.py +35 -0
  14. {langroid-0.1.196 → langroid-0.1.198}/langroid/vector_store/chromadb.py +10 -4
  15. {langroid-0.1.196 → langroid-0.1.198}/pyproject.toml +7 -5
  16. {langroid-0.1.196 → langroid-0.1.198}/LICENSE +0 -0
  17. {langroid-0.1.196 → langroid-0.1.198}/README.md +0 -0
  18. {langroid-0.1.196 → langroid-0.1.198}/langroid/__init__.py +0 -0
  19. {langroid-0.1.196 → langroid-0.1.198}/langroid/agent/__init__.py +0 -0
  20. {langroid-0.1.196 → langroid-0.1.198}/langroid/agent/base.py +0 -0
  21. {langroid-0.1.196 → langroid-0.1.198}/langroid/agent/batch.py +0 -0
  22. {langroid-0.1.196 → langroid-0.1.198}/langroid/agent/callbacks/__init__.py +0 -0
  23. {langroid-0.1.196 → langroid-0.1.198}/langroid/agent/callbacks/chainlit.py +0 -0
  24. {langroid-0.1.196 → langroid-0.1.198}/langroid/agent/helpers.py +0 -0
  25. {langroid-0.1.196 → langroid-0.1.198}/langroid/agent/junk +0 -0
  26. {langroid-0.1.196 → langroid-0.1.198}/langroid/agent/openai_assistant.py +0 -0
  27. {langroid-0.1.196 → langroid-0.1.198}/langroid/agent/special/__init__.py +0 -0
  28. {langroid-0.1.196 → langroid-0.1.198}/langroid/agent/special/lance_doc_chat_agent.py +0 -0
  29. {langroid-0.1.196 → langroid-0.1.198}/langroid/agent/special/lance_rag/__init__.py +0 -0
  30. {langroid-0.1.196 → langroid-0.1.198}/langroid/agent/special/lance_rag/critic_agent.py +0 -0
  31. {langroid-0.1.196 → langroid-0.1.198}/langroid/agent/special/lance_rag/lance_rag_task.py +0 -0
  32. {langroid-0.1.196 → langroid-0.1.198}/langroid/agent/special/lance_rag/lance_tools.py +0 -0
  33. {langroid-0.1.196 → langroid-0.1.198}/langroid/agent/special/lance_rag/query_planner_agent.py +0 -0
  34. {langroid-0.1.196 → langroid-0.1.198}/langroid/agent/special/neo4j/__init__.py +0 -0
  35. {langroid-0.1.196 → langroid-0.1.198}/langroid/agent/special/neo4j/csv_kg_chat.py +0 -0
  36. {langroid-0.1.196 → langroid-0.1.198}/langroid/agent/special/neo4j/neo4j_chat_agent.py +0 -0
  37. {langroid-0.1.196 → langroid-0.1.198}/langroid/agent/special/neo4j/utils/__init__.py +0 -0
  38. {langroid-0.1.196 → langroid-0.1.198}/langroid/agent/special/neo4j/utils/system_message.py +0 -0
  39. {langroid-0.1.196 → langroid-0.1.198}/langroid/agent/special/relevance_extractor_agent.py +0 -0
  40. {langroid-0.1.196 → langroid-0.1.198}/langroid/agent/special/retriever_agent.py +0 -0
  41. {langroid-0.1.196 → langroid-0.1.198}/langroid/agent/special/sql/__init__.py +0 -0
  42. {langroid-0.1.196 → langroid-0.1.198}/langroid/agent/special/sql/sql_chat_agent.py +0 -0
  43. {langroid-0.1.196 → langroid-0.1.198}/langroid/agent/special/sql/utils/__init__.py +0 -0
  44. {langroid-0.1.196 → langroid-0.1.198}/langroid/agent/special/sql/utils/description_extractors.py +0 -0
  45. {langroid-0.1.196 → langroid-0.1.198}/langroid/agent/special/sql/utils/populate_metadata.py +0 -0
  46. {langroid-0.1.196 → langroid-0.1.198}/langroid/agent/special/sql/utils/system_message.py +0 -0
  47. {langroid-0.1.196 → langroid-0.1.198}/langroid/agent/special/sql/utils/tools.py +0 -0
  48. {langroid-0.1.196 → langroid-0.1.198}/langroid/agent/special/table_chat_agent.py +0 -0
  49. {langroid-0.1.196 → langroid-0.1.198}/langroid/agent/tools/__init__.py +0 -0
  50. {langroid-0.1.196 → langroid-0.1.198}/langroid/agent/tools/duckduckgo_search_tool.py +0 -0
  51. {langroid-0.1.196 → langroid-0.1.198}/langroid/agent/tools/extract_tool.py +0 -0
  52. {langroid-0.1.196 → langroid-0.1.198}/langroid/agent/tools/generator_tool.py +0 -0
  53. {langroid-0.1.196 → langroid-0.1.198}/langroid/agent/tools/google_search_tool.py +0 -0
  54. {langroid-0.1.196 → langroid-0.1.198}/langroid/agent/tools/metaphor_search_tool.py +0 -0
  55. {langroid-0.1.196 → langroid-0.1.198}/langroid/agent/tools/recipient_tool.py +0 -0
  56. {langroid-0.1.196 → langroid-0.1.198}/langroid/agent/tools/run_python_code.py +0 -0
  57. {langroid-0.1.196 → langroid-0.1.198}/langroid/agent/tools/sciphi_search_rag_tool.py +0 -0
  58. {langroid-0.1.196 → langroid-0.1.198}/langroid/agent/tools/segment_extract_tool.py +0 -0
  59. {langroid-0.1.196 → langroid-0.1.198}/langroid/agent_config.py +0 -0
  60. {langroid-0.1.196 → langroid-0.1.198}/langroid/cachedb/__init__.py +0 -0
  61. {langroid-0.1.196 → langroid-0.1.198}/langroid/cachedb/base.py +0 -0
  62. {langroid-0.1.196 → langroid-0.1.198}/langroid/cachedb/momento_cachedb.py +0 -0
  63. {langroid-0.1.196 → langroid-0.1.198}/langroid/cachedb/redis_cachedb.py +0 -0
  64. {langroid-0.1.196 → langroid-0.1.198}/langroid/embedding_models/__init__.py +0 -0
  65. {langroid-0.1.196 → langroid-0.1.198}/langroid/embedding_models/base.py +0 -0
  66. {langroid-0.1.196 → langroid-0.1.198}/langroid/embedding_models/clustering.py +0 -0
  67. {langroid-0.1.196 → langroid-0.1.198}/langroid/language_models/__init__.py +0 -0
  68. {langroid-0.1.196 → langroid-0.1.198}/langroid/language_models/azure_openai.py +0 -0
  69. {langroid-0.1.196 → langroid-0.1.198}/langroid/language_models/base.py +0 -0
  70. {langroid-0.1.196 → langroid-0.1.198}/langroid/language_models/config.py +0 -0
  71. {langroid-0.1.196 → langroid-0.1.198}/langroid/language_models/openai_assistants.py +0 -0
  72. {langroid-0.1.196 → langroid-0.1.198}/langroid/language_models/openai_gpt.py +0 -0
  73. {langroid-0.1.196 → langroid-0.1.198}/langroid/language_models/prompt_formatter/__init__.py +0 -0
  74. {langroid-0.1.196 → langroid-0.1.198}/langroid/language_models/prompt_formatter/base.py +0 -0
  75. {langroid-0.1.196 → langroid-0.1.198}/langroid/language_models/prompt_formatter/llama2_formatter.py +0 -0
  76. {langroid-0.1.196 → langroid-0.1.198}/langroid/language_models/utils.py +0 -0
  77. {langroid-0.1.196 → langroid-0.1.198}/langroid/mytypes.py +0 -0
  78. {langroid-0.1.196 → langroid-0.1.198}/langroid/parsing/__init__.py +0 -0
  79. {langroid-0.1.196 → langroid-0.1.198}/langroid/parsing/agent_chats.py +0 -0
  80. {langroid-0.1.196 → langroid-0.1.198}/langroid/parsing/code-parsing.md +0 -0
  81. {langroid-0.1.196 → langroid-0.1.198}/langroid/parsing/code_parser.py +0 -0
  82. {langroid-0.1.196 → langroid-0.1.198}/langroid/parsing/config.py +0 -0
  83. {langroid-0.1.196 → langroid-0.1.198}/langroid/parsing/para_sentence_split.py +0 -0
  84. {langroid-0.1.196 → langroid-0.1.198}/langroid/parsing/repo_loader.py +0 -0
  85. {langroid-0.1.196 → langroid-0.1.198}/langroid/parsing/search.py +0 -0
  86. {langroid-0.1.196 → langroid-0.1.198}/langroid/parsing/spider.py +0 -0
  87. {langroid-0.1.196 → langroid-0.1.198}/langroid/parsing/table_loader.py +0 -0
  88. {langroid-0.1.196 → langroid-0.1.198}/langroid/parsing/url_loader.py +0 -0
  89. {langroid-0.1.196 → langroid-0.1.198}/langroid/parsing/url_loader_cookies.py +0 -0
  90. {langroid-0.1.196 → langroid-0.1.198}/langroid/parsing/urls.py +0 -0
  91. {langroid-0.1.196 → langroid-0.1.198}/langroid/parsing/utils.py +0 -0
  92. {langroid-0.1.196 → langroid-0.1.198}/langroid/parsing/web_search.py +0 -0
  93. {langroid-0.1.196 → langroid-0.1.198}/langroid/prompts/__init__.py +0 -0
  94. {langroid-0.1.196 → langroid-0.1.198}/langroid/prompts/chat-gpt4-system-prompt.md +0 -0
  95. {langroid-0.1.196 → langroid-0.1.198}/langroid/prompts/dialog.py +0 -0
  96. {langroid-0.1.196 → langroid-0.1.198}/langroid/prompts/prompts_config.py +0 -0
  97. {langroid-0.1.196 → langroid-0.1.198}/langroid/prompts/templates.py +0 -0
  98. {langroid-0.1.196 → langroid-0.1.198}/langroid/prompts/transforms.py +0 -0
  99. {langroid-0.1.196 → langroid-0.1.198}/langroid/utils/__init__.py +0 -0
  100. {langroid-0.1.196 → langroid-0.1.198}/langroid/utils/algorithms/__init__.py +0 -0
  101. {langroid-0.1.196 → langroid-0.1.198}/langroid/utils/algorithms/graph.py +0 -0
  102. {langroid-0.1.196 → langroid-0.1.198}/langroid/utils/configuration.py +0 -0
  103. {langroid-0.1.196 → langroid-0.1.198}/langroid/utils/constants.py +0 -0
  104. {langroid-0.1.196 → langroid-0.1.198}/langroid/utils/docker.py +0 -0
  105. {langroid-0.1.196 → langroid-0.1.198}/langroid/utils/globals.py +0 -0
  106. {langroid-0.1.196 → langroid-0.1.198}/langroid/utils/llms/__init__.py +0 -0
  107. {langroid-0.1.196 → langroid-0.1.198}/langroid/utils/llms/strings.py +0 -0
  108. {langroid-0.1.196 → langroid-0.1.198}/langroid/utils/logging.py +0 -0
  109. {langroid-0.1.196 → langroid-0.1.198}/langroid/utils/output/__init__.py +0 -0
  110. {langroid-0.1.196 → langroid-0.1.198}/langroid/utils/output/printing.py +0 -0
  111. {langroid-0.1.196 → langroid-0.1.198}/langroid/utils/pandas_utils.py +0 -0
  112. {langroid-0.1.196 → langroid-0.1.198}/langroid/utils/web/__init__.py +0 -0
  113. {langroid-0.1.196 → langroid-0.1.198}/langroid/utils/web/login.py +0 -0
  114. {langroid-0.1.196 → langroid-0.1.198}/langroid/utils/web/selenium_login.py +0 -0
  115. {langroid-0.1.196 → langroid-0.1.198}/langroid/vector_store/__init__.py +0 -0
  116. {langroid-0.1.196 → langroid-0.1.198}/langroid/vector_store/base.py +0 -0
  117. {langroid-0.1.196 → langroid-0.1.198}/langroid/vector_store/lancedb.py +0 -0
  118. {langroid-0.1.196 → langroid-0.1.198}/langroid/vector_store/meilisearch.py +0 -0
  119. {langroid-0.1.196 → langroid-0.1.198}/langroid/vector_store/momento.py +0 -0
  120. {langroid-0.1.196 → langroid-0.1.198}/langroid/vector_store/qdrant_cloud.py +0 -0
  121. {langroid-0.1.196 → langroid-0.1.198}/langroid/vector_store/qdrantdb.py +0 -0
{langroid-0.1.196 → langroid-0.1.198}/PKG-INFO
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: langroid
- Version: 0.1.196
+ Version: 0.1.198
  Summary: Harness LLMs with Multi-Agent Programming
  License: MIT
  Author: Prasad Chalasani
@@ -18,20 +18,21 @@ Provides-Extra: mysql
  Provides-Extra: neo4j
  Provides-Extra: postgres
  Provides-Extra: sciphi
+ Provides-Extra: transformers
+ Provides-Extra: unstructured
  Requires-Dist: agent-search (>=0.0.7,<0.0.8) ; extra == "sciphi"
  Requires-Dist: aiohttp (>=3.9.1,<4.0.0)
  Requires-Dist: async-generator (>=1.10,<2.0)
  Requires-Dist: autopep8 (>=2.0.2,<3.0.0)
  Requires-Dist: black[jupyter] (>=23.3.0,<24.0.0)
  Requires-Dist: bs4 (>=0.0.1,<0.0.2)
- Requires-Dist: chainlit (>=1.0.200,<2.0.0) ; extra == "chainlit"
- Requires-Dist: chromadb (==0.3.21)
+ Requires-Dist: chainlit (>=1.0.301,<2.0.0) ; extra == "chainlit"
+ Requires-Dist: chromadb (>=0.4.21,<=0.4.23)
  Requires-Dist: colorlog (>=6.7.0,<7.0.0)
  Requires-Dist: docstring-parser (>=0.15,<0.16)
  Requires-Dist: duckduckgo-search (>=4.4,<5.0)
  Requires-Dist: faker (>=18.9.0,<19.0.0)
  Requires-Dist: fakeredis (>=2.12.1,<3.0.0)
- Requires-Dist: farm-haystack[file-conversion,ocr,pdf,preprocessing] (>=1.21.1,<2.0.0)
  Requires-Dist: fire (>=0.5.0,<0.6.0)
  Requires-Dist: flake8 (>=6.0.0,<7.0.0)
  Requires-Dist: google-api-python-client (>=2.95.0,<3.0.0)
@@ -95,7 +96,7 @@ Requires-Dist: trafilatura (>=1.5.0,<2.0.0)
  Requires-Dist: typer (>=0.9.0,<0.10.0)
  Requires-Dist: types-redis (>=4.5.5.2,<5.0.0.0)
  Requires-Dist: types-requests (>=2.31.0.1,<3.0.0.0)
- Requires-Dist: unstructured[docx,pdf,pptx] (>=0.10.16,<0.10.18)
+ Requires-Dist: unstructured[docx,pdf,pptx] (>=0.10.16,<0.10.18) ; extra == "unstructured"
  Requires-Dist: wget (>=3.2,<4.0)
  Description-Content-Type: text/markdown

{langroid-0.1.196 → langroid-0.1.198}/langroid/agent/chat_agent.py
@@ -225,14 +225,22 @@ class ChatAgent(Agent):
          enabled_classes: List[Type[ToolMessage]] = list(self.llm_tools_map.values())
          if len(enabled_classes) == 0:
              return "You can ask questions in natural language."
-
          json_instructions = "\n\n".join(
              [
-                 msg_cls.json_instructions()
+                 msg_cls.json_instructions(tool=self.config.use_tools)
                  for _, msg_cls in enumerate(enabled_classes)
                  if msg_cls.default_value("request") in self.llm_tools_usable
              ]
          )
+         # if any of the enabled classes has json_group_instructions, then use that,
+         # else fall back to ToolMessage.json_group_instructions
+         for msg_cls in enabled_classes:
+             if hasattr(msg_cls, "json_group_instructions") and callable(
+                 getattr(msg_cls, "json_group_instructions")
+             ):
+                 return msg_cls.json_group_instructions().format(
+                     json_instructions=json_instructions
+                 )
          return ToolMessage.json_group_instructions().format(
              json_instructions=json_instructions
          )
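The fallback loop above lets any enabled ToolMessage subclass supply the group-level wrapper around the per-tool instructions. A minimal sketch of such an override (the tool name and wording here are illustrative, not taken from the langroid source):

    from langroid.agent.tool_message import ToolMessage

    class PolinskyTool(ToolMessage):
        request: str = "polinsky"
        purpose: str = "Get the Polinsky transform of a given <number>"
        number: int

        @classmethod
        def json_group_instructions(cls) -> str:
            # must keep the {json_instructions} placeholder; the ChatAgent
            # fills it in with the per-tool instructions via .format()
            return """
            Use ONLY ONE of the following JSON tools per message:
            {json_instructions}
            """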
{langroid-0.1.196 → langroid-0.1.198}/langroid/agent/chat_document.py
@@ -84,7 +84,7 @@ class ChatDocument(Document):
              json_data = json.loads(j)
              tool = json_data.get("request")
              if tool is not None:
-                 tools.append(tool)
+                 tools.append(str(tool))
          return tools

      def log_fields(self) -> ChatDocLoggerFields:
{langroid-0.1.196 → langroid-0.1.198}/langroid/agent/special/doc_chat_agent.py
@@ -135,7 +135,7 @@ class DocChatAgentConfig(ChatAgentConfig):
          # NOTE: PDF parsing is extremely challenging, and each library
          # has its own strengths and weaknesses.
          # Try one that works for your use case.
-         # or "haystack", "unstructured", "pdfplumber", "fitz", "pypdf"
+         # or "unstructured", "pdfplumber", "fitz", "pypdf"
          library="pdfplumber",
      ),
  )
@@ -156,7 +156,7 @@ class DocChatAgentConfig(ChatAgentConfig):
      collection_name="doc-chat-lancedb",
      replace_collection=True,
      storage_path=".lancedb/data/",
-     embedding=hf_embed_config,
+     embedding=oai_embed_config,
  )
  llm: OpenAIGPTConfig = OpenAIGPTConfig(
      type="openai",
{langroid-0.1.196 → langroid-0.1.198}/langroid/agent/task.py
@@ -2,6 +2,7 @@ from __future__ import annotations

  import copy
  import logging
+ import re
  from collections import Counter
  from types import SimpleNamespace
  from typing import (
@@ -781,17 +782,20 @@ class Task:
          # handle routing instruction in result if any,
          # of the form PASS=<recipient>
          content = msg.content if isinstance(msg, ChatDocument) else msg
+         content = content.strip()
          if PASS in content and PASS_TO not in content:
              return True, None
          if PASS_TO in content and content.split(":")[1] != "":
              return True, content.split(":")[1]
-         if SEND_TO in content and content.split(":")[1] != "":
-             recipient = content.split(":")[1]
+         if SEND_TO in content and (send_parts := re.split(r"[,: ]", content))[1] != "":
+             # assume syntax is SEND_TO:<recipient> <content>
+             # or SEND_TO:<recipient>,<content> or SEND_TO:<recipient>:<content>
+             recipient = send_parts[1].strip()
              # get content to send, clean out routing instruction, and
              # start from 1 char after SEND_TO:<recipient>,
              # because we expect there is either a blank or some other separator
              # after the recipient
-             content_to_send = content.replace(f"{SEND_TO}:{recipient}", "").strip()[1:]
+             content_to_send = content.replace(f"{SEND_TO}{recipient}", "").strip()[1:]
              # if no content then treat same as PASS_TO
              if content_to_send == "":
                  return True, recipient
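A quick standalone check of what the walrus-assigned re.split yields for the three separator styles named in the comment ("SEND:" below is a stand-in for the actual SEND_TO constant, whose exact value is not shown in this diff):

    import re

    for content in ("SEND:Bob hello", "SEND:Bob,hello", "SEND:Bob:hello"):
        parts = re.split(r"[,: ]", content)
        print(parts)     # ['SEND', 'Bob', 'hello'] in all three cases
        print(parts[1])  # 'Bob' -> the recipient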
{langroid-0.1.196 → langroid-0.1.198}/langroid/agent/tool_message.py
@@ -16,7 +16,10 @@ from docstring_parser import parse
  from pydantic import BaseModel

  from langroid.language_models.base import LLMFunctionSpec
- from langroid.utils.pydantic_utils import _recursive_purge_dict_key
+ from langroid.utils.pydantic_utils import (
+     _recursive_purge_dict_key,
+     generate_simple_schema,
+ )


  class ToolMessage(ABC, BaseModel):
@@ -79,6 +82,9 @@ class ToolMessage(ABC, BaseModel):
          ex = choice(cls.examples())
          return ex.json_example()

+     def to_json(self) -> str:
+         return self.json(indent=4, exclude={"result", "purpose"})
+
      def json_example(self) -> str:
          return self.json(indent=4, exclude={"result", "purpose"})

@@ -101,22 +107,30 @@
          return properties.get(f, {}).get("default", None)

      @classmethod
-     def json_instructions(cls) -> str:
+     def json_instructions(cls, tool: bool = False) -> str:
          """
          Default Instructions to the LLM showing how to use the tool/function-call.
          Works for GPT4 but override this for weaker LLMs if needed.
+
+         Args:
+             tool: instructions for Langroid-native tool use? (e.g. for non-OpenAI LLM)
+                 (or else it would be for OpenAI Function calls)
          Returns:
              str: instructions on how to use the message
          """
+         # TODO: when we attempt to use a "simpler schema"
+         # (i.e. all nested fields explicit without definitions),
+         # we seem to get worse results, so we turn it off for now
+         param_dict = (
+             # cls.simple_schema() if tool else
+             cls.llm_function_schema(request=True).parameters
+         )
          return textwrap.dedent(
              f"""
              TOOL: {cls.default_value("request")}
              PURPOSE: {cls.default_value("purpose")}
              JSON FORMAT: {
-                 json.dumps(
-                     cls.llm_function_schema(request=True).parameters,
-                     indent=4,
-                 )
+                 json.dumps(param_dict, indent=4)
              }
              {"EXAMPLE: " + cls.usage_example() if cls.examples() else ""}
              """.lstrip()
@@ -210,3 +224,14 @@
              description=cls.default_value("purpose"),
              parameters=parameters,
          )
+
+     @classmethod
+     def simple_schema(cls) -> Dict[str, Any]:
+         """
+         Return a simplified schema for the message, with only the request and
+         required fields.
+         Returns:
+             Dict[str, Any]: simplified schema
+         """
+         schema = generate_simple_schema(cls, exclude=["result", "purpose"])
+         return schema
{langroid-0.1.196 → langroid-0.1.198}/langroid/embedding_models/models.py
@@ -6,7 +6,6 @@ from dotenv import load_dotenv
  from openai import OpenAI

  from langroid.embedding_models.base import EmbeddingModel, EmbeddingModelsConfig
- from langroid.language_models.utils import retry_with_exponential_backoff
  from langroid.mytypes import Embeddings
  from langroid.parsing.utils import batched

@@ -26,6 +25,58 @@ class SentenceTransformerEmbeddingsConfig(EmbeddingModelsConfig):
      context_length: int = 512


+ class EmbeddingFunctionCallable:
+     """
+     A callable class designed to generate embeddings for a list of texts using
+     the OpenAI API, with automatic retries on failure.
+
+     Attributes:
+         model (OpenAIEmbeddings): An instance of OpenAIEmbeddings that provides
+             configuration and utilities for generating embeddings.
+
+     Methods:
+         __call__(input: List[str]) -> Embeddings: Generate embeddings for
+             a list of input texts.
+     """
+
+     def __init__(self, model: "OpenAIEmbeddings"):
+         """
+         Initialize the EmbeddingFunctionCallable with a specific model.
+
+         Args:
+             model (OpenAIEmbeddings): An instance of OpenAIEmbeddings to use for
+                 generating embeddings.
+         """
+         self.model = model
+
+     def __call__(self, input: List[str]) -> Embeddings:
+         """
+         Generate embeddings for a given list of input texts using the OpenAI API,
+         with retries on failure.
+
+         This method:
+         - Truncates each text in the input list to the model's maximum context length.
+         - Processes the texts in batches to generate embeddings efficiently.
+         - Automatically retries the embedding generation process with exponential
+           backoff in case of failures.
+
+         Args:
+             input (List[str]): A list of input texts to generate embeddings for.
+
+         Returns:
+             Embeddings: A list of embedding vectors corresponding to the input texts.
+         """
+         tokenized_texts = self.model.truncate_texts(input)
+         embeds = []
+         for batch in batched(tokenized_texts, 500):
+             result = self.model.client.embeddings.create(
+                 input=batch, model=self.model.config.model_name
+             )
+             batch_embeds = [d.embedding for d in result.data]
+             embeds.extend(batch_embeds)
+         return embeds
+
+
  class OpenAIEmbeddings(EmbeddingModel):
      def __init__(self, config: OpenAIEmbeddingsConfig = OpenAIEmbeddingsConfig()):
          super().__init__()
@@ -56,19 +107,7 @@ class OpenAIEmbeddings(EmbeddingModel):
          ]

      def embedding_fn(self) -> Callable[[List[str]], Embeddings]:
-         @retry_with_exponential_backoff
-         def fn(texts: List[str]) -> Embeddings:
-             tokenized_texts = self.truncate_texts(texts)
-             embeds = []
-             for batch in batched(tokenized_texts, 500):
-                 result = self.client.embeddings.create(
-                     input=batch, model=self.config.model_name
-                 )
-                 batch_embeds = [d.embedding for d in result.data]
-                 embeds.extend(batch_embeds)
-             return embeds
-
-         return fn
+         return EmbeddingFunctionCallable(self)

      @property
      def embedding_dims(self) -> int:
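Replacing the decorated closure with a class instance keeps the returned embedding function picklable and introspectable while leaving call sites unchanged. A usage sketch (assumes an OpenAI API key is configured; OpenAIEmbeddingsConfig is taken from the constructor default above):

    from langroid.embedding_models.models import (
        OpenAIEmbeddings,
        OpenAIEmbeddingsConfig,
    )

    embed_model = OpenAIEmbeddings(OpenAIEmbeddingsConfig())
    fn = embed_model.embedding_fn()        # an EmbeddingFunctionCallable
    vecs = fn(["hello world", "goodbye"])  # one embedding vector per text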
{langroid-0.1.196 → langroid-0.1.198}/langroid/language_models/prompt_formatter/hf_formatter.py
@@ -6,11 +6,10 @@ models will have the same tokenizer, so we just use the first one.
  """
  import logging
  import re
- from typing import List, Set
+ from typing import Any, List, Set, Type

  from huggingface_hub import HfApi, ModelFilter
  from jinja2.exceptions import TemplateError
- from transformers import AutoTokenizer

  from langroid.language_models.base import LanguageModel, LLMMessage, Role
  from langroid.language_models.config import HFPromptFormatterConfig
@@ -19,6 +18,31 @@ from langroid.language_models.prompt_formatter.base import PromptFormatter
  logger = logging.getLogger(__name__)


+ def try_import_AutoTokenizer() -> Type[Any]:
+     """
+     Attempts to import the AutoTokenizer class from the transformers package.
+     Returns:
+         The AutoTokenizer class if successful.
+     Raises:
+         ImportError: If the transformers package is not installed.
+     """
+     try:
+         from transformers import AutoTokenizer
+
+         return AutoTokenizer  # type: ignore
+     except ImportError:
+         raise ImportError(
+             """
+             You are trying to use the HuggingFace transformers.AutoTokenizer,
+             but the `transformers` package is not installed
+             by default with Langroid. Please install langroid using the
+             `transformers` extra, like so:
+             pip install "langroid[transformers]"
+             or equivalent.
+             """
+         )
+
+
  def find_hf_formatter(model_name: str) -> str:
      hf_api = HfApi()
      # try to find a matching model, with progressively shorter prefixes of model_name
@@ -37,6 +61,7 @@ def find_hf_formatter(model_name: str) -> str:
              mdl = next(models)
          except StopIteration:
              continue
+         AutoTokenizer = try_import_AutoTokenizer()
          tokenizer = AutoTokenizer.from_pretrained(mdl.id)
          if tokenizer.chat_template is not None:
              return str(mdl.id)
@@ -60,6 +85,7 @@ class HFFormatter(PromptFormatter):
              mdl = next(models)
          except StopIteration:
              raise ValueError(f"Model {config.model_name} not found on HuggingFace Hub")
+         AutoTokenizer = try_import_AutoTokenizer()
          self.tokenizer = AutoTokenizer.from_pretrained(mdl.id)
          if self.tokenizer.chat_template is None:
              raise ValueError(
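With the import deferred, constructing a formatter is now what first pulls in transformers. A sketch, assuming the `transformers` extra is installed (the model id is illustrative; any Hub model with a chat template should work):

    from langroid.language_models.config import HFPromptFormatterConfig
    from langroid.language_models.prompt_formatter.hf_formatter import HFFormatter

    config = HFPromptFormatterConfig(model_name="mistralai/Mistral-7B-Instruct-v0.2")
    formatter = HFFormatter(config)  # looks up the model on the HF Hub and
                                     # loads its tokenizer's chat template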
{langroid-0.1.196 → langroid-0.1.198}/langroid/parsing/document_parser.py
@@ -11,7 +11,6 @@ import requests

  from langroid.mytypes import DocMetaData, Document
  from langroid.parsing.parser import Parser, ParsingConfig
- from langroid.parsing.urls import url_to_tempfile

  logger = logging.getLogger(__name__)

@@ -54,8 +53,6 @@ class DocumentParser(Parser):
              return PDFPlumberParser(source, config)
          elif config.pdf.library == "unstructured":
              return UnstructuredPDFParser(source, config)
-         elif config.pdf.library == "haystack":
-             return HaystackPDFParser(source, config)
          else:
              raise ValueError(
                  f"Unsupported PDF library specified: {config.pdf.library}"
@@ -301,59 +298,23 @@ class PDFPlumberParser(DocumentParser):
          return self.fix_text(page.extract_text())


- class HaystackPDFParser(DocumentParser):
-     """
-     Parser for processing PDFs using the `haystack` library.
-     """
-
-     def get_doc_chunks(self) -> List[Document]:
-         """
-         Overrides the base class method to use the `haystack` library.
-         See there for more details.
-         """
-
-         from haystack.nodes import PDFToTextConverter, PreProcessor
-
-         converter = PDFToTextConverter(
-             remove_numeric_tables=True,
-         )
-         path = self.source
-         if path.startswith(("http://", "https://")):
-             path = url_to_tempfile(path)
-         doc = converter.convert(file_path=path, meta=None)
-         # note self.config.chunk_size is in token units,
-         # and we use an approximation of 75 words per 100 tokens
-         # to convert to word units
-         preprocessor = PreProcessor(
-             clean_empty_lines=True,
-             clean_whitespace=True,
-             clean_header_footer=False,
-             split_by="word",
-             split_length=int(0.75 * self.config.chunk_size),
-             split_overlap=int(0.75 * self.config.overlap),
-             split_respect_sentence_boundary=True,
-             add_page_number=True,
-         )
-         chunks = preprocessor.process(doc)
-         return [
-             Document(
-                 content=chunk.content,
-                 metadata=DocMetaData(
-                     source=f"{self.source} page {chunk.meta['page']}",
-                     is_chunk=True,
-                 ),
-             )
-             for chunk in chunks
-         ]
-
-
  class UnstructuredPDFParser(DocumentParser):
      """
      Parser for processing PDF files using the `unstructured` library.
      """

      def iterate_pages(self) -> Generator[Tuple[int, Any], None, None]:  # type: ignore
-         from unstructured.partition.pdf import partition_pdf
+         try:
+             from unstructured.partition.pdf import partition_pdf
+         except ImportError:
+             raise ImportError(
+                 """
+                 The `unstructured` library is not installed by default with langroid.
+                 To include this library, please install langroid with the
+                 `unstructured` extra by running `pip install "langroid[unstructured]"`
+                 or equivalent.
+                 """
+             )

      # from unstructured.chunking.title import chunk_by_title

@@ -367,7 +328,7 @@ class UnstructuredPDFParser(DocumentParser):
                  Please try a different library by setting the `library` field
                  in the `pdf` section of the `parsing` field in the config file.
                  Supported libraries are:
-                 fitz, pypdf, pdfplumber, unstructured, haystack
+                 fitz, pypdf, pdfplumber, unstructured
                  """
              )

@@ -406,7 +367,17 @@ class UnstructuredDocxParser(DocumentParser):
      """

      def iterate_pages(self) -> Generator[Tuple[int, Any], None, None]:  # type: ignore
-         from unstructured.partition.docx import partition_docx
+         try:
+             from unstructured.partition.docx import partition_docx
+         except ImportError:
+             raise ImportError(
+                 """
+                 The `unstructured` library is not installed by default with langroid.
+                 To include this library, please install langroid with the
+                 `unstructured` extra by running `pip install "langroid[unstructured]"`
+                 or equivalent.
+                 """
+             )

          elements = partition_docx(file=self.doc_bytes, include_page_breaks=True)

@@ -447,7 +418,17 @@

  class UnstructuredDocParser(UnstructuredDocxParser):
      def iterate_pages(self) -> Generator[Tuple[int, Any], None, None]:  # type: ignore
-         from unstructured.partition.doc import partition_doc
+         try:
+             from unstructured.partition.doc import partition_doc
+         except ImportError:
+             raise ImportError(
+                 """
+                 The `unstructured` library is not installed by default with langroid.
+                 To include this library, please install langroid with the
+                 `unstructured` extra by running `pip install "langroid[unstructured]"`
+                 or equivalent.
+                 """
+             )

          elements = partition_doc(filename=self.source, include_page_breaks=True)

{langroid-0.1.196 → langroid-0.1.198}/langroid/parsing/json.py
@@ -1,4 +1,5 @@
  import json
+ import re
  from typing import Any, Iterator, List

  from pyparsing import nestedExpr, originalTextFor
@@ -44,6 +45,60 @@ def get_json_candidates(s: str) -> List[str]:
      return []


+ def replace_undefined(s: str, undefined_placeholder: str = '"<undefined>"') -> str:
+     """
+     Replace undefined values in a potential json str with a placeholder.
+
+     Args:
+         - s (str): The potential JSON string to parse.
+         - undefined_placeholder (str): The placeholder or error message
+             for undefined values.
+
+     Returns:
+         - str: The (potential) JSON string with undefined values
+             replaced by the placeholder.
+     """
+
+     # Preprocess the string to replace undefined values with the placeholder
+     # This regex looks for patterns like ": <identifier>" and replaces them
+     # with the placeholder.
+     # It's a simple approach and might need adjustments for complex cases
+     # This is an attempt to handle cases where a weak LLM may produce
+     # a JSON-like string without quotes around some values, e.g.
+     # {"rent": DO-NOT-KNOW }
+     preprocessed_s = re.sub(
+         r":\s*([a-zA-Z_][a-zA-Z_0-9\-]*)", f": {undefined_placeholder}", s
+     )
+
+     # Now, attempt to parse the preprocessed string as JSON
+     try:
+         return preprocessed_s
+     except Exception:
+         # If parsing fails, return an error message instead
+         # (this should be rare after preprocessing)
+         return s
+
+
+ def repair_newlines(s: str) -> str:
+     """
+     Attempt to load as json, and if it fails, try with newlines replaced by space.
+     Intended to handle cases where weak LLMs produce JSON-like strings where
+     some string-values contain explicit newlines, e.g.:
+     {"text": "This is a text\n with a newline"}
+     These would not be valid JSON, so we try to clean them up here.
+     """
+     try:
+         json.loads(s)
+         return s
+     except Exception:
+         try:
+             s = s.replace("\n", " ")
+             json.loads(s)
+             return s
+         except Exception:
+             return s
+
+
  def extract_top_level_json(s: str) -> List[str]:
      """Extract all top-level JSON-formatted substrings from a given string.

@@ -53,15 +108,17 @@ def extract_top_level_json(s: str) -> List[str]:
      Returns:
          List[str]: A list of top-level JSON-formatted substrings.
      """
-     # Find JSON object and array candidates using regular expressions
+     # Find JSON object and array candidates
      json_candidates = get_json_candidates(s)

      normalized_candidates = [
          candidate.replace("\\{", "{").replace("\\}", "}").replace("\\_", "_")
          for candidate in json_candidates
      ]
+     candidates = [replace_undefined(candidate) for candidate in normalized_candidates]
+     candidates = [repair_newlines(candidate) for candidate in candidates]
      top_level_jsons = [
-         candidate for candidate in normalized_candidates if is_valid_json(candidate)
+         candidate for candidate in candidates if is_valid_json(candidate)
      ]

      return top_level_jsons
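End to end, the two repair passes let extract_top_level_json accept output that a strict json.loads would reject. A sketch of the expected behavior (the exact output string is inferred from the code above):

    from langroid.parsing.json import extract_top_level_json

    messy = 'Sure! {"request": "rent_info", "rent": DO-NOT-KNOW}'
    print(extract_top_level_json(messy))
    # -> ['{"request": "rent_info", "rent": "<undefined>"}']
    # the unquoted DO-NOT-KNOW is swapped for the "<undefined>" placeholder,
    # which makes the candidate valid JSON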
{langroid-0.1.196 → langroid-0.1.198}/langroid/parsing/parser.py
@@ -19,9 +19,7 @@ class Splitter(str, Enum):


  class PdfParsingConfig(BaseSettings):
-     library: Literal[
-         "fitz", "pdfplumber", "pypdf", "unstructured", "haystack"
-     ] = "pdfplumber"
+     library: Literal["fitz", "pdfplumber", "pypdf", "unstructured"] = "pdfplumber"


  class DocxParsingConfig(BaseSettings):
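Configs that previously set library="haystack" must now pick one of the four remaining parsers, and "unstructured" additionally requires the extra. A sketch (the pdf field on ParsingConfig is inferred from the error message in document_parser.py above):

    from langroid.parsing.parser import ParsingConfig, PdfParsingConfig

    parsing = ParsingConfig(
        # "unstructured" needs: pip install "langroid[unstructured]"
        pdf=PdfParsingConfig(library="unstructured"),
    )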
{langroid-0.1.196 → langroid-0.1.198}/langroid/utils/pydantic_utils.py
@@ -135,6 +135,53 @@ def flatten_pydantic_model(
      return create_model("FlatModel", __base__=base_model, **flattened_fields)


+ def get_field_names(model: Type[BaseModel]) -> List[str]:
+     """Get all field names from a possibly nested Pydantic model."""
+     mdl = flatten_pydantic_model(model)
+     fields = list(mdl.__fields__.keys())
+     # fields may be like a__b__c , so we only want the last part
+     return [f.split("__")[-1] for f in fields]
+
+
+ def generate_simple_schema(
+     model: Type[BaseModel], exclude: List[str] = []
+ ) -> Dict[str, Any]:
+     """
+     Generates a JSON schema for a Pydantic model,
+     with options to exclude specific fields.
+
+     This function traverses the Pydantic model's fields, including nested models,
+     to generate a dictionary representing the JSON schema. Fields specified in
+     the exclude list will not be included in the generated schema.
+
+     Args:
+         model (Type[BaseModel]): The Pydantic model class to generate the schema for.
+         exclude (List[str]): A list of string field names to be excluded from the
+             generated schema. Defaults to an empty list.
+
+     Returns:
+         Dict[str, Any]: A dictionary representing the JSON schema of the provided
+             model, with specified fields excluded.
+     """
+     if hasattr(model, "__fields__"):
+         output: Dict[str, Any] = {}
+         for field_name, field in model.__fields__.items():
+             if field_name in exclude:
+                 continue  # Skip excluded fields
+
+             field_type = field.type_
+             if issubclass(field_type, BaseModel):
+                 # Recursively generate schema for nested models
+                 output[field_name] = generate_simple_schema(field_type, exclude)
+             else:
+                 # Represent the type as a string here
+                 output[field_name] = {"type": field_type.__name__}
+         return output
+     else:
+         # Non-model type, return a simplified representation
+         return {"type": model.__name__}
+
+
  def flatten_pydantic_instance(
      instance: BaseModel,
      prefix: str = "",
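A small worked example of the traversal above, with the expected result derived by reading the code (typing constructs such as Optional or List fields would need more handling than this sketch exercises):

    from pydantic import BaseModel

    class Address(BaseModel):
        city: str
        zipcode: str

    class Person(BaseModel):
        name: str
        address: Address

    generate_simple_schema(Person, exclude=["zipcode"])
    # -> {"name": {"type": "str"},
    #     "address": {"city": {"type": "str"}}}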
{langroid-0.1.196 → langroid-0.1.198}/langroid/utils/system.py
@@ -1,10 +1,12 @@
  import getpass
  import hashlib
+ import importlib
  import inspect
  import logging
  import shutil
  import socket
  import traceback
+ from typing import Any

  logger = logging.getLogger(__name__)

@@ -15,6 +17,39 @@ DELETION_ALLOWED_PATHS = [
  ]


+ class LazyLoad:
+     """Lazy loading of modules or classes."""
+
+     def __init__(self, import_path: str) -> None:
+         self.import_path = import_path
+         self._target = None
+         self._is_target_loaded = False
+
+     def _load_target(self) -> None:
+         if not self._is_target_loaded:
+             try:
+                 # Attempt to import as a module
+                 self._target = importlib.import_module(self.import_path)  # type: ignore
+             except ImportError:
+                 # If module import fails, attempt to import as a
+                 # class or function from a module
+                 module_path, attr_name = self.import_path.rsplit(".", 1)
+                 module = importlib.import_module(module_path)
+                 self._target = getattr(module, attr_name)
+             self._is_target_loaded = True
+
+     def __getattr__(self, name: str) -> Any:
+         self._load_target()
+         return getattr(self._target, name)
+
+     def __call__(self, *args: Any, **kwargs: Any) -> Any:
+         self._load_target()
+         if callable(self._target):
+             return self._target(*args, **kwargs)
+         else:
+             raise TypeError(f"{self.import_path!r} object is not callable")
+
+
  def rmdir(path: str) -> bool:
      """
      Remove a directory recursively.
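A usage sketch for LazyLoad, covering the two import paths the code handles (a plain module, or a dotted attribute within one):

    from langroid.utils.system import LazyLoad

    np = LazyLoad("numpy")          # nothing is imported yet
    a = np.arange(3)                # first attribute access triggers the import
    loads = LazyLoad("json.loads")  # dotted path resolved as a module attribute
    data = loads('{"x": 1}')        # __call__ imports json, then calls loads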
{langroid-0.1.196 → langroid-0.1.198}/langroid/vector_store/chromadb.py
@@ -141,10 +141,16 @@ class ChromaDB(VectorStore):
          return self._docs_from_results(results)

      def get_documents_by_ids(self, ids: List[str]) -> List[Document]:
-         results = self.collection.get(ids=ids, include=["documents", "metadatas"])
-         results["documents"] = [results["documents"]]
-         results["metadatas"] = [results["metadatas"]]
-         return self._docs_from_results(results)
+         # get them one by one since chroma mangles the order of the results
+         # when fetched from a list of ids.
+         results = [
+             self.collection.get(ids=[id], include=["documents", "metadatas"])
+             for id in ids
+         ]
+         final_results = {}
+         final_results["documents"] = [[r["documents"][0] for r in results]]
+         final_results["metadatas"] = [[r["metadatas"][0] for r in results]]
+         return self._docs_from_results(final_results)

      def delete_collection(self, collection_name: str) -> None:
          self.client.delete_collection(name=collection_name)
{langroid-0.1.196 → langroid-0.1.198}/pyproject.toml
@@ -1,6 +1,6 @@
  [tool.poetry]
  name = "langroid"
- version = "0.1.196"
+ version = "0.1.198"
  description = "Harness LLMs with Multi-Agent Programming"
  authors = ["Prasad Chalasani <pchalasani@gmail.com>"]
  readme = "README.md"
@@ -16,7 +16,7 @@ mkdocs-gen-files = "^0.4.0"
  mkdocs-literate-nav = "^0.6.0"
  mkdocs-section-index = "^0.3.5"
  mkdocs-jupyter = "^0.24.1"
- chromadb = "0.3.21"
+ chromadb = ">=0.4.21, <=0.4.23"
  onnxruntime = "1.16.1"
  fire = "^0.5.0"
  black = {extras = ["jupyter"], version = "^23.3.0"}
@@ -56,7 +56,7 @@ prettytable = "^3.8.0"
  tantivy = "^0.21.0"
  google-api-python-client = "^2.95.0"
  lxml = "^4.9.3"
- unstructured = {extras = ["docx", "pptx", "pdf"], version = ">=0.10.16,<0.10.18"}
+ unstructured = {extras = ["docx", "pptx", "pdf"], version = ">=0.10.16,<0.10.18", optional=true}

  sentence-transformers = {version="2.2.2", optional=true}
  torch = {version="2.0.0", optional=true}
@@ -72,7 +72,6 @@ pymupdf = "^1.23.3"
  jinja2 = "^3.1.2"
  pytest-asyncio = "^0.21.1"
  docstring-parser = "^0.15"
- farm-haystack = {extras = ["ocr", "preprocessing", "file-conversion", "pdf"], version = "^1.21.1"}
  meilisearch = "^0.28.3"
  meilisearch-python-sdk = "^2.2.3"
  litellm = {version = "^1.23.0", optional = true}
@@ -85,7 +84,7 @@ agent-search = {version = "^0.0.7", optional = true}
  python-docx = "^1.1.0"
  aiohttp = "^3.9.1"
  metaphor-python = {version = "^0.1.23", optional = true}
- chainlit = {version = "^1.0.200", optional = true}
+ chainlit = {version = "^1.0.301", optional = true}
  python-socketio = {version="^5.11.0", optional=true}
  duckduckgo-search = "^4.4"

@@ -93,6 +92,8 @@ duckduckgo-search = "^4.4"
  # install these using `poetry install -E [...]` where [...] is one of the extras below
  # or install multiple extras using, e.g., `poetry install -E "litellm mysql"`
  hf-embeddings = ["sentence-transformers", "torch"]
+ transformers = ["transformers"]
+ unstructured = ["unstructured"]
  postgres = ["psycopg2", "pytest-postgresql"]
  mysql = ["pymysql", "pytest-mysql"]
  litellm = ["litellm"]
@@ -127,6 +128,7 @@ exclude = [
      "langroid/embedding_models/clustering.py",
      #TODO revisit why mypy keeps failing on gh actions, but works fine locally
      "langroid/agent/callbacks/chainlit.py",
+     "langroid/vector_store/chromadb.py"
  ]
  files=["langroid/*"]
  plugins = [