langroid 0.45.10__tar.gz → 0.47.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (135) hide show
  1. {langroid-0.45.10 → langroid-0.47.0}/PKG-INFO +3 -1
  2. {langroid-0.45.10 → langroid-0.47.0}/langroid/agent/special/doc_chat_agent.py +13 -3
  3. {langroid-0.45.10 → langroid-0.47.0}/langroid/embedding_models/models.py +54 -14
  4. {langroid-0.45.10 → langroid-0.47.0}/langroid/language_models/openai_gpt.py +51 -2
  5. langroid-0.47.0/langroid/parsing/url_loader.py +340 -0
  6. {langroid-0.45.10 → langroid-0.47.0}/pyproject.toml +4 -1
  7. langroid-0.45.10/langroid/parsing/url_loader.py +0 -120
  8. {langroid-0.45.10 → langroid-0.47.0}/.gitignore +0 -0
  9. {langroid-0.45.10 → langroid-0.47.0}/LICENSE +0 -0
  10. {langroid-0.45.10 → langroid-0.47.0}/README.md +0 -0
  11. {langroid-0.45.10 → langroid-0.47.0}/langroid/__init__.py +0 -0
  12. {langroid-0.45.10 → langroid-0.47.0}/langroid/agent/__init__.py +0 -0
  13. {langroid-0.45.10 → langroid-0.47.0}/langroid/agent/base.py +0 -0
  14. {langroid-0.45.10 → langroid-0.47.0}/langroid/agent/batch.py +0 -0
  15. {langroid-0.45.10 → langroid-0.47.0}/langroid/agent/callbacks/__init__.py +0 -0
  16. {langroid-0.45.10 → langroid-0.47.0}/langroid/agent/callbacks/chainlit.py +0 -0
  17. {langroid-0.45.10 → langroid-0.47.0}/langroid/agent/chat_agent.py +0 -0
  18. {langroid-0.45.10 → langroid-0.47.0}/langroid/agent/chat_document.py +0 -0
  19. {langroid-0.45.10 → langroid-0.47.0}/langroid/agent/openai_assistant.py +0 -0
  20. {langroid-0.45.10 → langroid-0.47.0}/langroid/agent/special/__init__.py +0 -0
  21. {langroid-0.45.10 → langroid-0.47.0}/langroid/agent/special/arangodb/__init__.py +0 -0
  22. {langroid-0.45.10 → langroid-0.47.0}/langroid/agent/special/arangodb/arangodb_agent.py +0 -0
  23. {langroid-0.45.10 → langroid-0.47.0}/langroid/agent/special/arangodb/system_messages.py +0 -0
  24. {langroid-0.45.10 → langroid-0.47.0}/langroid/agent/special/arangodb/tools.py +0 -0
  25. {langroid-0.45.10 → langroid-0.47.0}/langroid/agent/special/arangodb/utils.py +0 -0
  26. {langroid-0.45.10 → langroid-0.47.0}/langroid/agent/special/lance_doc_chat_agent.py +0 -0
  27. {langroid-0.45.10 → langroid-0.47.0}/langroid/agent/special/lance_rag/__init__.py +0 -0
  28. {langroid-0.45.10 → langroid-0.47.0}/langroid/agent/special/lance_rag/critic_agent.py +0 -0
  29. {langroid-0.45.10 → langroid-0.47.0}/langroid/agent/special/lance_rag/lance_rag_task.py +0 -0
  30. {langroid-0.45.10 → langroid-0.47.0}/langroid/agent/special/lance_rag/query_planner_agent.py +0 -0
  31. {langroid-0.45.10 → langroid-0.47.0}/langroid/agent/special/lance_tools.py +0 -0
  32. {langroid-0.45.10 → langroid-0.47.0}/langroid/agent/special/neo4j/__init__.py +0 -0
  33. {langroid-0.45.10 → langroid-0.47.0}/langroid/agent/special/neo4j/csv_kg_chat.py +0 -0
  34. {langroid-0.45.10 → langroid-0.47.0}/langroid/agent/special/neo4j/neo4j_chat_agent.py +0 -0
  35. {langroid-0.45.10 → langroid-0.47.0}/langroid/agent/special/neo4j/system_messages.py +0 -0
  36. {langroid-0.45.10 → langroid-0.47.0}/langroid/agent/special/neo4j/tools.py +0 -0
  37. {langroid-0.45.10 → langroid-0.47.0}/langroid/agent/special/relevance_extractor_agent.py +0 -0
  38. {langroid-0.45.10 → langroid-0.47.0}/langroid/agent/special/retriever_agent.py +0 -0
  39. {langroid-0.45.10 → langroid-0.47.0}/langroid/agent/special/sql/__init__.py +0 -0
  40. {langroid-0.45.10 → langroid-0.47.0}/langroid/agent/special/sql/sql_chat_agent.py +0 -0
  41. {langroid-0.45.10 → langroid-0.47.0}/langroid/agent/special/sql/utils/__init__.py +0 -0
  42. {langroid-0.45.10 → langroid-0.47.0}/langroid/agent/special/sql/utils/description_extractors.py +0 -0
  43. {langroid-0.45.10 → langroid-0.47.0}/langroid/agent/special/sql/utils/populate_metadata.py +0 -0
  44. {langroid-0.45.10 → langroid-0.47.0}/langroid/agent/special/sql/utils/system_message.py +0 -0
  45. {langroid-0.45.10 → langroid-0.47.0}/langroid/agent/special/sql/utils/tools.py +0 -0
  46. {langroid-0.45.10 → langroid-0.47.0}/langroid/agent/special/table_chat_agent.py +0 -0
  47. {langroid-0.45.10 → langroid-0.47.0}/langroid/agent/task.py +0 -0
  48. {langroid-0.45.10 → langroid-0.47.0}/langroid/agent/tool_message.py +0 -0
  49. {langroid-0.45.10 → langroid-0.47.0}/langroid/agent/tools/__init__.py +0 -0
  50. {langroid-0.45.10 → langroid-0.47.0}/langroid/agent/tools/duckduckgo_search_tool.py +0 -0
  51. {langroid-0.45.10 → langroid-0.47.0}/langroid/agent/tools/exa_search_tool.py +0 -0
  52. {langroid-0.45.10 → langroid-0.47.0}/langroid/agent/tools/file_tools.py +0 -0
  53. {langroid-0.45.10 → langroid-0.47.0}/langroid/agent/tools/google_search_tool.py +0 -0
  54. {langroid-0.45.10 → langroid-0.47.0}/langroid/agent/tools/metaphor_search_tool.py +0 -0
  55. {langroid-0.45.10 → langroid-0.47.0}/langroid/agent/tools/orchestration.py +0 -0
  56. {langroid-0.45.10 → langroid-0.47.0}/langroid/agent/tools/recipient_tool.py +0 -0
  57. {langroid-0.45.10 → langroid-0.47.0}/langroid/agent/tools/retrieval_tool.py +0 -0
  58. {langroid-0.45.10 → langroid-0.47.0}/langroid/agent/tools/rewind_tool.py +0 -0
  59. {langroid-0.45.10 → langroid-0.47.0}/langroid/agent/tools/segment_extract_tool.py +0 -0
  60. {langroid-0.45.10 → langroid-0.47.0}/langroid/agent/tools/tavily_search_tool.py +0 -0
  61. {langroid-0.45.10 → langroid-0.47.0}/langroid/agent/xml_tool_message.py +0 -0
  62. {langroid-0.45.10 → langroid-0.47.0}/langroid/cachedb/__init__.py +0 -0
  63. {langroid-0.45.10 → langroid-0.47.0}/langroid/cachedb/base.py +0 -0
  64. {langroid-0.45.10 → langroid-0.47.0}/langroid/cachedb/momento_cachedb.py +0 -0
  65. {langroid-0.45.10 → langroid-0.47.0}/langroid/cachedb/redis_cachedb.py +0 -0
  66. {langroid-0.45.10 → langroid-0.47.0}/langroid/embedding_models/__init__.py +0 -0
  67. {langroid-0.45.10 → langroid-0.47.0}/langroid/embedding_models/base.py +0 -0
  68. {langroid-0.45.10 → langroid-0.47.0}/langroid/embedding_models/protoc/__init__.py +0 -0
  69. {langroid-0.45.10 → langroid-0.47.0}/langroid/embedding_models/protoc/embeddings.proto +0 -0
  70. {langroid-0.45.10 → langroid-0.47.0}/langroid/embedding_models/protoc/embeddings_pb2.py +0 -0
  71. {langroid-0.45.10 → langroid-0.47.0}/langroid/embedding_models/protoc/embeddings_pb2.pyi +0 -0
  72. {langroid-0.45.10 → langroid-0.47.0}/langroid/embedding_models/protoc/embeddings_pb2_grpc.py +0 -0
  73. {langroid-0.45.10 → langroid-0.47.0}/langroid/embedding_models/remote_embeds.py +0 -0
  74. {langroid-0.45.10 → langroid-0.47.0}/langroid/exceptions.py +0 -0
  75. {langroid-0.45.10 → langroid-0.47.0}/langroid/language_models/__init__.py +0 -0
  76. {langroid-0.45.10 → langroid-0.47.0}/langroid/language_models/azure_openai.py +0 -0
  77. {langroid-0.45.10 → langroid-0.47.0}/langroid/language_models/base.py +0 -0
  78. {langroid-0.45.10 → langroid-0.47.0}/langroid/language_models/config.py +0 -0
  79. {langroid-0.45.10 → langroid-0.47.0}/langroid/language_models/mock_lm.py +0 -0
  80. {langroid-0.45.10 → langroid-0.47.0}/langroid/language_models/model_info.py +0 -0
  81. {langroid-0.45.10 → langroid-0.47.0}/langroid/language_models/prompt_formatter/__init__.py +0 -0
  82. {langroid-0.45.10 → langroid-0.47.0}/langroid/language_models/prompt_formatter/base.py +0 -0
  83. {langroid-0.45.10 → langroid-0.47.0}/langroid/language_models/prompt_formatter/hf_formatter.py +0 -0
  84. {langroid-0.45.10 → langroid-0.47.0}/langroid/language_models/prompt_formatter/llama2_formatter.py +0 -0
  85. {langroid-0.45.10 → langroid-0.47.0}/langroid/language_models/utils.py +0 -0
  86. {langroid-0.45.10 → langroid-0.47.0}/langroid/mytypes.py +0 -0
  87. {langroid-0.45.10 → langroid-0.47.0}/langroid/parsing/__init__.py +0 -0
  88. {langroid-0.45.10 → langroid-0.47.0}/langroid/parsing/agent_chats.py +0 -0
  89. {langroid-0.45.10 → langroid-0.47.0}/langroid/parsing/code_parser.py +0 -0
  90. {langroid-0.45.10 → langroid-0.47.0}/langroid/parsing/document_parser.py +0 -0
  91. {langroid-0.45.10 → langroid-0.47.0}/langroid/parsing/para_sentence_split.py +0 -0
  92. {langroid-0.45.10 → langroid-0.47.0}/langroid/parsing/parse_json.py +0 -0
  93. {langroid-0.45.10 → langroid-0.47.0}/langroid/parsing/parser.py +0 -0
  94. {langroid-0.45.10 → langroid-0.47.0}/langroid/parsing/pdf_utils.py +0 -0
  95. {langroid-0.45.10 → langroid-0.47.0}/langroid/parsing/repo_loader.py +0 -0
  96. {langroid-0.45.10 → langroid-0.47.0}/langroid/parsing/routing.py +0 -0
  97. {langroid-0.45.10 → langroid-0.47.0}/langroid/parsing/search.py +0 -0
  98. {langroid-0.45.10 → langroid-0.47.0}/langroid/parsing/spider.py +0 -0
  99. {langroid-0.45.10 → langroid-0.47.0}/langroid/parsing/table_loader.py +0 -0
  100. {langroid-0.45.10 → langroid-0.47.0}/langroid/parsing/urls.py +0 -0
  101. {langroid-0.45.10 → langroid-0.47.0}/langroid/parsing/utils.py +0 -0
  102. {langroid-0.45.10 → langroid-0.47.0}/langroid/parsing/web_search.py +0 -0
  103. {langroid-0.45.10 → langroid-0.47.0}/langroid/prompts/__init__.py +0 -0
  104. {langroid-0.45.10 → langroid-0.47.0}/langroid/prompts/dialog.py +0 -0
  105. {langroid-0.45.10 → langroid-0.47.0}/langroid/prompts/prompts_config.py +0 -0
  106. {langroid-0.45.10 → langroid-0.47.0}/langroid/prompts/templates.py +0 -0
  107. {langroid-0.45.10 → langroid-0.47.0}/langroid/py.typed +0 -0
  108. {langroid-0.45.10 → langroid-0.47.0}/langroid/pydantic_v1/__init__.py +0 -0
  109. {langroid-0.45.10 → langroid-0.47.0}/langroid/pydantic_v1/main.py +0 -0
  110. {langroid-0.45.10 → langroid-0.47.0}/langroid/utils/__init__.py +0 -0
  111. {langroid-0.45.10 → langroid-0.47.0}/langroid/utils/algorithms/__init__.py +0 -0
  112. {langroid-0.45.10 → langroid-0.47.0}/langroid/utils/algorithms/graph.py +0 -0
  113. {langroid-0.45.10 → langroid-0.47.0}/langroid/utils/configuration.py +0 -0
  114. {langroid-0.45.10 → langroid-0.47.0}/langroid/utils/constants.py +0 -0
  115. {langroid-0.45.10 → langroid-0.47.0}/langroid/utils/git_utils.py +0 -0
  116. {langroid-0.45.10 → langroid-0.47.0}/langroid/utils/globals.py +0 -0
  117. {langroid-0.45.10 → langroid-0.47.0}/langroid/utils/logging.py +0 -0
  118. {langroid-0.45.10 → langroid-0.47.0}/langroid/utils/object_registry.py +0 -0
  119. {langroid-0.45.10 → langroid-0.47.0}/langroid/utils/output/__init__.py +0 -0
  120. {langroid-0.45.10 → langroid-0.47.0}/langroid/utils/output/citations.py +0 -0
  121. {langroid-0.45.10 → langroid-0.47.0}/langroid/utils/output/printing.py +0 -0
  122. {langroid-0.45.10 → langroid-0.47.0}/langroid/utils/output/status.py +0 -0
  123. {langroid-0.45.10 → langroid-0.47.0}/langroid/utils/pandas_utils.py +0 -0
  124. {langroid-0.45.10 → langroid-0.47.0}/langroid/utils/pydantic_utils.py +0 -0
  125. {langroid-0.45.10 → langroid-0.47.0}/langroid/utils/system.py +0 -0
  126. {langroid-0.45.10 → langroid-0.47.0}/langroid/utils/types.py +0 -0
  127. {langroid-0.45.10 → langroid-0.47.0}/langroid/vector_store/__init__.py +0 -0
  128. {langroid-0.45.10 → langroid-0.47.0}/langroid/vector_store/base.py +0 -0
  129. {langroid-0.45.10 → langroid-0.47.0}/langroid/vector_store/chromadb.py +0 -0
  130. {langroid-0.45.10 → langroid-0.47.0}/langroid/vector_store/lancedb.py +0 -0
  131. {langroid-0.45.10 → langroid-0.47.0}/langroid/vector_store/meilisearch.py +0 -0
  132. {langroid-0.45.10 → langroid-0.47.0}/langroid/vector_store/pineconedb.py +0 -0
  133. {langroid-0.45.10 → langroid-0.47.0}/langroid/vector_store/postgres.py +0 -0
  134. {langroid-0.45.10 → langroid-0.47.0}/langroid/vector_store/qdrantdb.py +0 -0
  135. {langroid-0.45.10 → langroid-0.47.0}/langroid/vector_store/weaviatedb.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: langroid
3
- Version: 0.45.10
3
+ Version: 0.47.0
4
4
  Summary: Harness LLMs with Multi-Agent Programming
5
5
  Author-email: Prasad Chalasani <pchalasani@gmail.com>
6
6
  License: MIT
@@ -121,6 +121,8 @@ Provides-Extra: exa
121
121
  Requires-Dist: exa-py>=1.8.7; extra == 'exa'
122
122
  Provides-Extra: fastembed
123
123
  Requires-Dist: fastembed<0.4.0,>=0.3.1; extra == 'fastembed'
124
+ Provides-Extra: firecrawl
125
+ Requires-Dist: firecrawl-py>=1.13.5; extra == 'firecrawl'
124
126
  Provides-Extra: google-genai
125
127
  Requires-Dist: google-genai>=1.0.0; extra == 'google-genai'
126
128
  Provides-Extra: google-generativeai
@@ -50,7 +50,7 @@ from langroid.parsing.search import (
50
50
  preprocess_text,
51
51
  )
52
52
  from langroid.parsing.table_loader import describe_dataframe
53
- from langroid.parsing.url_loader import URLLoader
53
+ from langroid.parsing.url_loader import BaseCrawlerConfig, TrafilaturaConfig, URLLoader
54
54
  from langroid.parsing.urls import get_list_from_user, get_urls_paths_bytes_indices
55
55
  from langroid.prompts.prompts_config import PromptsConfig
56
56
  from langroid.prompts.templates import SUMMARY_ANSWER_PROMPT_GPT4
@@ -192,6 +192,7 @@ class DocChatAgentConfig(ChatAgentConfig):
192
192
  library="pymupdf4llm",
193
193
  ),
194
194
  )
195
+ crawler_config: Optional[BaseCrawlerConfig] = TrafilaturaConfig()
195
196
 
196
197
  # Allow vecdb to be None in case we want to explicitly set it later
197
198
  vecdb: Optional[VectorStoreConfig] = QdrantDBConfig(
@@ -336,11 +337,15 @@ class DocChatAgent(ChatAgent):
336
337
  urls_meta = {u: idx2meta[u] for u in url_idxs}
337
338
  paths_meta = {p: idx2meta[p] for p in path_idxs}
338
339
  docs: List[Document] = []
339
- parser = Parser(self.config.parsing)
340
+ parser: Parser = Parser(self.config.parsing)
340
341
  if len(urls) > 0:
341
342
  for ui in url_idxs:
342
343
  meta = urls_meta.get(ui, {})
343
- loader = URLLoader(urls=[all_paths[ui]], parser=parser) # type: ignore
344
+ loader = URLLoader(
345
+ urls=[all_paths[ui]],
346
+ parsing_config=self.config.parsing,
347
+ crawler_config=self.config.crawler_config,
348
+ ) # type: ignore
344
349
  url_docs = loader.load()
345
350
  # update metadata of each doc with meta
346
351
  for d in url_docs:
@@ -466,6 +471,11 @@ class DocChatAgent(ChatAgent):
466
471
  docs = docs[: self.config.parsing.max_chunks]
467
472
  # vecdb should take care of adding docs in batches;
468
473
  # batching can be controlled via vecdb.config.batch_size
474
+ if not docs:
475
+ logging.warning(
476
+ "No documents to ingest after processing. Skipping VecDB addition."
477
+ )
478
+ return 0 # Return 0 since no documents were added
469
479
  self.vecdb.add_documents(docs)
470
480
  self.original_docs_length = self.doc_length(docs)
471
481
  self.setup_documents(docs, filter=self.config.filter)
@@ -10,6 +10,7 @@ from openai import AzureOpenAI, OpenAI
10
10
 
11
11
  from langroid.embedding_models.base import EmbeddingModel, EmbeddingModelsConfig
12
12
  from langroid.exceptions import LangroidImportError
13
+ from langroid.language_models.openai_gpt import LangDBParams
13
14
  from langroid.mytypes import Embeddings
14
15
  from langroid.parsing.utils import batched
15
16
 
@@ -24,6 +25,7 @@ class OpenAIEmbeddingsConfig(EmbeddingModelsConfig):
24
25
  organization: str = ""
25
26
  dims: int = 1536
26
27
  context_length: int = 8192
28
+ langdb_params: LangDBParams = LangDBParams()
27
29
 
28
30
  class Config:
29
31
  # enable auto-loading of env vars with OPENAI_ prefix, e.g.
@@ -136,11 +138,13 @@ class EmbeddingFunctionCallable:
136
138
  """
137
139
  embeds = []
138
140
  if isinstance(self.embed_model, (OpenAIEmbeddings, AzureOpenAIEmbeddings)):
139
- tokenized_texts = self.embed_model.truncate_texts(input)
141
+ # Truncate texts to context length while preserving text format
142
+ truncated_texts = self.embed_model.truncate_texts(input)
140
143
 
141
- for batch in batched(tokenized_texts, self.batch_size):
144
+ # Process in batches
145
+ for batch in batched(truncated_texts, self.batch_size):
142
146
  result = self.embed_model.client.embeddings.create(
143
- input=batch, model=self.embed_model.config.model_name
147
+ input=batch, model=self.embed_model.config.model_name # type: ignore
144
148
  )
145
149
  batch_embeds = [d.embedding for d in result.data]
146
150
  embeds.extend(batch_embeds)
@@ -183,30 +187,66 @@ class OpenAIEmbeddings(EmbeddingModel):
183
187
  super().__init__()
184
188
  self.config = config
185
189
  load_dotenv()
186
- self.config.api_key = os.getenv("OPENAI_API_KEY", "")
190
+
191
+ # Check if using LangDB
192
+ self.is_langdb = self.config.model_name.startswith("langdb/")
193
+
194
+ if self.is_langdb:
195
+ self.config.model_name = self.config.model_name.replace("langdb/", "")
196
+ self.config.api_base = self.config.langdb_params.base_url
197
+ project_id = self.config.langdb_params.project_id
198
+ if project_id:
199
+ self.config.api_base += "/" + project_id + "/v1"
200
+ self.config.api_key = self.config.langdb_params.api_key
201
+
202
+ if not self.config.api_key:
203
+ self.config.api_key = os.getenv("OPENAI_API_KEY", "")
204
+
187
205
  self.config.organization = os.getenv("OPENAI_ORGANIZATION", "")
206
+
188
207
  if self.config.api_key == "":
189
- raise ValueError(
190
- """OPENAI_API_KEY env variable must be set to use
191
- OpenAIEmbeddings. Please set the OPENAI_API_KEY value
192
- in your .env file.
193
- """
194
- )
195
- self.client = OpenAI(base_url=self.config.api_base, api_key=self.config.api_key)
208
+ if self.is_langdb:
209
+ raise ValueError(
210
+ """
211
+ LANGDB_API_KEY must be set in .env or your environment
212
+ to use OpenAIEmbeddings via LangDB.
213
+ """
214
+ )
215
+ else:
216
+ raise ValueError(
217
+ """
218
+ OPENAI_API_KEY must be set in .env or your environment
219
+ to use OpenAIEmbeddings.
220
+ """
221
+ )
222
+
223
+ self.client = OpenAI(
224
+ base_url=self.config.api_base,
225
+ api_key=self.config.api_key,
226
+ organization=self.config.organization,
227
+ )
228
+ model_for_tokenizer = self.config.model_name
229
+ if model_for_tokenizer.startswith("openai/"):
230
+ self.config.model_name = model_for_tokenizer.replace("openai/", "")
196
231
  self.tokenizer = tiktoken.encoding_for_model(self.config.model_name)
197
232
 
198
- def truncate_texts(self, texts: List[str]) -> List[List[int]]:
233
+ def truncate_texts(self, texts: List[str]) -> List[str] | List[List[int]]:
199
234
  """
200
235
  Truncate texts to the embedding model's context length.
201
236
  TODO: Maybe we should show warning, and consider doing T5 summarization?
202
237
  """
203
- return [
238
+ truncated_tokens = [
204
239
  self.tokenizer.encode(text, disallowed_special=())[
205
240
  : self.config.context_length
206
241
  ]
207
242
  for text in texts
208
243
  ]
209
244
 
245
+ if self.is_langdb:
246
+ # LangDB embedding endpt only works with strings, not tokens
247
+ return [self.tokenizer.decode(tokens) for tokens in truncated_tokens]
248
+ return truncated_tokens
249
+
210
250
  def embedding_fn(self) -> Callable[[List[str]], Embeddings]:
211
251
  return EmbeddingFunctionCallable(self, self.config.batch_size)
212
252
 
@@ -256,7 +296,7 @@ class AzureOpenAIEmbeddings(EmbeddingModel):
256
296
  )
257
297
  self.tokenizer = tiktoken.encoding_for_model(self.config.model_name)
258
298
 
259
- def truncate_texts(self, texts: List[str]) -> List[List[int]]:
299
+ def truncate_texts(self, texts: List[str]) -> List[str] | List[List[int]]:
260
300
  """
261
301
  Truncate texts to the embedding model's context length.
262
302
  TODO: Maybe we should show warning, and consider doing T5 summarization?
@@ -66,7 +66,7 @@ from langroid.language_models.utils import (
66
66
  retry_with_exponential_backoff,
67
67
  )
68
68
  from langroid.parsing.parse_json import parse_imperfect_json
69
- from langroid.pydantic_v1 import BaseModel
69
+ from langroid.pydantic_v1 import BaseModel, BaseSettings
70
70
  from langroid.utils.configuration import settings
71
71
  from langroid.utils.constants import Colors
72
72
  from langroid.utils.system import friendly_error
@@ -82,9 +82,13 @@ DEEPSEEK_BASE_URL = "https://api.deepseek.com/v1"
82
82
  OPENROUTER_BASE_URL = "https://openrouter.ai/api/v1"
83
83
  GEMINI_BASE_URL = "https://generativelanguage.googleapis.com/v1beta/openai"
84
84
  GLHF_BASE_URL = "https://glhf.chat/api/openai/v1"
85
+ LANGDB_BASE_URL = "https://api.us-east-1.langdb.ai"
85
86
  OLLAMA_API_KEY = "ollama"
86
87
  DUMMY_API_KEY = "xxx"
87
88
 
89
+ VLLM_API_KEY = os.environ.get("VLLM_API_KEY", DUMMY_API_KEY)
90
+ LLAMACPP_API_KEY = os.environ.get("LLAMA_API_KEY", DUMMY_API_KEY)
91
+
88
92
 
89
93
  openai_chat_model_pref_list = [
90
94
  OpenAIChatModel.GPT4o,
@@ -177,6 +181,24 @@ def noop() -> None:
177
181
  return None
178
182
 
179
183
 
184
class LangDBParams(BaseSettings):
    """Connection and request-tagging settings for the LangDB integration.

    Every field can be supplied through an environment variable that carries
    the ``LANGDB_`` prefix (see ``Config.env_prefix``).
    """

    # Key used for LangDB requests; falls back to the dummy placeholder.
    api_key: str = DUMMY_API_KEY
    # When non-empty, "/<project_id>/v1" is appended to `base_url` by the
    # clients in this package, and it is also sent as the "x-project-id" header.
    project_id: str = ""
    # Optional request tags, sent as "x-label", "x-run-id" and "x-thread-id"
    # headers respectively when set.
    label: Optional[str] = None
    run_id: Optional[str] = None
    thread_id: Optional[str] = None
    # Root URL of the LangDB API.
    base_url: str = LANGDB_BASE_URL

    class Config:
        # Lets BaseSettings populate the fields from the environment,
        # e.g. LANGDB_PROJECT_ID=1234
        env_prefix = "LANGDB_"
200
+
201
+
180
202
  class OpenAICallParams(BaseModel):
181
203
  """
182
204
  Various params that can be sent to an OpenAI API chat-completion call.
@@ -253,6 +275,8 @@ class OpenAIGPTConfig(LLMConfig):
253
275
  # e.g. "mistral-instruct-v0.2 (a fuzzy search is done to find the closest match)
254
276
  formatter: str | None = None
255
277
  hf_formatter: HFFormatter | None = None
278
+ langdb_params: LangDBParams = LangDBParams()
279
+ headers: Dict[str, str] = {}
256
280
 
257
281
  def __init__(self, **kwargs) -> None: # type: ignore
258
282
  local_model = "api_base" in kwargs and kwargs["api_base"] is not None
@@ -496,6 +520,7 @@ class OpenAIGPT(LanguageModel):
496
520
  self.is_deepseek = self.is_deepseek_model()
497
521
  self.is_glhf = self.config.chat_model.startswith("glhf/")
498
522
  self.is_openrouter = self.config.chat_model.startswith("openrouter/")
523
+ self.is_langdb = self.config.chat_model.startswith("langdb/")
499
524
 
500
525
  if self.is_groq:
501
526
  # use groq-specific client
@@ -544,18 +569,39 @@ class OpenAIGPT(LanguageModel):
544
569
  self.api_base = DEEPSEEK_BASE_URL
545
570
  if self.api_key == OPENAI_API_KEY:
546
571
  self.api_key = os.getenv("DEEPSEEK_API_KEY", DUMMY_API_KEY)
572
+ elif self.is_langdb:
573
+ self.config.chat_model = self.config.chat_model.replace("langdb/", "")
574
+ self.api_base = self.config.langdb_params.base_url
575
+ project_id = self.config.langdb_params.project_id
576
+ if project_id:
577
+ self.api_base += "/" + project_id + "/v1"
578
+ if self.api_key == OPENAI_API_KEY:
579
+ self.api_key = self.config.langdb_params.api_key or DUMMY_API_KEY
580
+
581
+ if self.config.langdb_params:
582
+ params = self.config.langdb_params
583
+ if params.project_id:
584
+ self.config.headers["x-project-id"] = params.project_id
585
+ if params.label:
586
+ self.config.headers["x-label"] = params.label
587
+ if params.run_id:
588
+ self.config.headers["x-run-id"] = params.run_id
589
+ if params.thread_id:
590
+ self.config.headers["x-thread-id"] = params.thread_id
547
591
 
548
592
  self.client = OpenAI(
549
593
  api_key=self.api_key,
550
594
  base_url=self.api_base,
551
595
  organization=self.config.organization,
552
596
  timeout=Timeout(self.config.timeout),
597
+ default_headers=self.config.headers,
553
598
  )
554
599
  self.async_client = AsyncOpenAI(
555
600
  api_key=self.api_key,
556
601
  organization=self.config.organization,
557
602
  base_url=self.api_base,
558
603
  timeout=Timeout(self.config.timeout),
604
+ default_headers=self.config.headers,
559
605
  )
560
606
 
561
607
  self.cache: CacheDB | None = None
@@ -1028,6 +1074,7 @@ class OpenAIGPT(LanguageModel):
1028
1074
  OpenAIResponse object (with choices, usage)
1029
1075
 
1030
1076
  """
1077
+
1031
1078
  completion = ""
1032
1079
  reasoning = ""
1033
1080
  function_args = ""
@@ -1075,7 +1122,9 @@ class OpenAIGPT(LanguageModel):
1075
1122
  )
1076
1123
 
1077
1124
  @staticmethod
1078
- def tool_deltas_to_tools(tools: List[Dict[str, Any]]) -> Tuple[
1125
+ def tool_deltas_to_tools(
1126
+ tools: List[Dict[str, Any]],
1127
+ ) -> Tuple[
1079
1128
  str,
1080
1129
  List[OpenAIToolCall],
1081
1130
  List[Dict[str, Any]],
@@ -0,0 +1,340 @@
1
+ import logging
2
+ import os
3
+ from abc import ABC, abstractmethod
4
+ from tempfile import NamedTemporaryFile
5
+ from typing import TYPE_CHECKING, Any, Dict, List, Optional
6
+
7
+ from dotenv import load_dotenv
8
+
9
+ from langroid.exceptions import LangroidImportError
10
+ from langroid.mytypes import DocMetaData, Document
11
+ from langroid.parsing.document_parser import DocumentParser, ImagePdfParser
12
+ from langroid.parsing.parser import Parser, ParsingConfig
13
+ from langroid.pydantic_v1 import BaseSettings
14
+
15
+ if TYPE_CHECKING:
16
+ from firecrawl import FirecrawlApp
17
+
18
+ load_dotenv()
19
+
20
+ logging.getLogger("url_loader").setLevel(logging.WARNING)
21
+
22
+
23
+ # Base crawler config and specific configurations
24
class BaseCrawlerConfig(BaseSettings):
    """Settings shared by every web-crawler configuration."""

    # Parser kept by crawlers whose `needs_parser` is true; None means the
    # configuration carries no parser.
    parser: Optional[Parser] = None
28
+
29
+
30
class TrafilaturaConfig(BaseCrawlerConfig):
    """Settings for the Trafilatura-based crawler."""

    # Thread count handed to trafilatura's `buffered_downloads`.
    threads: int = 4
34
+
35
+
36
class FirecrawlConfig(BaseCrawlerConfig):
    """Settings for the Firecrawl-based crawler."""

    # Key passed to `FirecrawlApp` when the crawler runs.
    api_key: str = ""
    # "scrape" fetches each given URL individually; "crawl" starts a crawl
    # from exactly one URL.
    mode: str = "scrape"
    # Extra parameters forwarded to the Firecrawl scrape/crawl call.
    params: Dict[str, Any] = {}
    # When set, copied into the call parameters under the "timeout" key.
    timeout: Optional[int] = None

    class Config:
        # BaseSettings fills the fields from environment variables carrying
        # this prefix, e.g. FIRECRAWL_MODE=scrape and FIRECRAWL_API_KEY=...
        env_prefix = "FIRECRAWL_"
49
+
50
+
51
class BaseCrawler(ABC):
    """Abstract base class for web crawlers.

    Subclasses implement `crawl` and state through `needs_parser` whether the
    parser carried by their configuration should be kept. The base class
    offers `_process_document`, which turns a URL pointing at a PDF/DOC/DOCX
    document into parsed chunks.
    """

    # Seconds to wait for the HEAD/GET requests issued while probing or
    # downloading a URL, so an unresponsive server cannot stall a crawl.
    _REQUEST_TIMEOUT: float = 30.0

    # (substring of the Content-Type header, temp-file suffix), checked in order.
    _CONTENT_TYPE_SUFFIXES = (
        ("application/pdf", ".pdf"),
        (
            "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
            ".docx",
        ),
        ("application/msword", ".doc"),
    )

    def __init__(self, config: BaseCrawlerConfig):
        """Initialize the base crawler.

        Args:
            config: Configuration for the crawler
        """
        # Crawlers that do not need a parser deliberately drop the one in config.
        self.parser = config.parser if self.needs_parser else None
        self.config: BaseCrawlerConfig = config

    @property
    @abstractmethod
    def needs_parser(self) -> bool:
        """Indicates whether the crawler requires a parser."""
        pass

    @abstractmethod
    def crawl(self, urls: List[str]) -> List[Document]:
        """Fetch `urls` and return the documents extracted from them."""
        pass

    def _process_document(self, url: str) -> List[Document]:
        """Parse `url` as a PDF/DOC/DOCX document when it is one.

        Args:
            url: Address to inspect.

        Returns:
            The parsed chunks, or an empty list when the crawler has no
            parser, the URL is not a supported document, or fetching/parsing
            failed (failures are logged, not raised).
        """
        if not self.parser:
            return []

        parsing_config = self.parser.config
        if self._is_document_url(url):
            return self._parse_document_url(url, parsing_config)

        # The extension is inconclusive; ask the server what it would send.
        suffix = self._suffix_from_content_type(url)
        if suffix is None:
            return []
        return self._download_and_parse(url, suffix, parsing_config)

    def _parse_document_url(self, url: str, parsing_config: Any) -> List[Document]:
        """Parse a URL whose extension already marks it as a document."""
        try:
            doc_parser = DocumentParser.create(url, parsing_config)
            new_chunks = doc_parser.get_doc_chunks()
            if not new_chunks:
                # Nothing extracted: retry with the image-based PDF parser.
                img_parser = ImagePdfParser(url, parsing_config)
                new_chunks = img_parser.get_doc_chunks()
            return new_chunks
        except Exception as e:
            logging.error(f"Error parsing {url}: {e}")
            return []

    def _suffix_from_content_type(self, url: str) -> Optional[str]:
        """Return the document suffix implied by the URL's Content-Type, if any."""
        import requests
        from requests.structures import CaseInsensitiveDict

        try:
            headers = requests.head(url, timeout=self._REQUEST_TIMEOUT).headers
        except Exception as e:
            logging.warning(f"Error getting headers for {url}: {e}")
            headers = CaseInsensitiveDict()

        content_type = headers.get("Content-Type", "").lower()
        for marker, suffix in self._CONTENT_TYPE_SUFFIXES:
            if marker in content_type:
                return suffix
        return None

    def _download_and_parse(
        self, url: str, suffix: str, parsing_config: Any
    ) -> List[Document]:
        """Download `url` to a temporary file with `suffix` and parse that file."""
        import requests

        temp_file_path: Optional[str] = None
        try:
            response = requests.get(url, timeout=self._REQUEST_TIMEOUT)
            with NamedTemporaryFile(delete=False, suffix=suffix) as temp_file:
                # Record the path before writing so a failed write is cleaned up.
                temp_file_path = temp_file.name
                temp_file.write(response.content)
            doc_parser = DocumentParser.create(temp_file_path, parsing_config)
            return doc_parser.get_doc_chunks()
        except Exception as e:
            logging.error(f"Error downloading/parsing {url}: {e}")
            return []
        finally:
            # Always remove the temp file, including when parsing raised.
            if temp_file_path is not None:
                try:
                    os.remove(temp_file_path)
                except OSError as e:
                    logging.warning(
                        f"Could not remove temp file {temp_file_path}: {e}"
                    )

    def _is_document_url(self, url: str) -> bool:
        """Return True when `url` ends in .pdf, .docx or .doc (case-insensitive)."""
        return url.lower().endswith((".pdf", ".docx", ".doc"))
131
+
132
+
133
class CrawlerFactory:
    """Builds the crawler that matches a given configuration object."""

    @staticmethod
    def create_crawler(config: BaseCrawlerConfig) -> BaseCrawler:
        """Return the crawler implementation that corresponds to `config`.

        Args:
            config: A `TrafilaturaConfig` or `FirecrawlConfig` instance.

        Returns:
            A `TrafilaturaCrawler` or `FirecrawlCrawler` built from `config`.

        Raises:
            ValueError: If `config` is of neither supported type.
        """
        if isinstance(config, TrafilaturaConfig):
            return TrafilaturaCrawler(config)
        if isinstance(config, FirecrawlConfig):
            return FirecrawlCrawler(config)
        raise ValueError(f"Unsupported crawler configuration type: {type(config)}")
155
+
156
+
157
class TrafilaturaCrawler(BaseCrawler):
    """Crawler that downloads pages and extracts their text with Trafilatura."""

    def __init__(self, config: TrafilaturaConfig):
        """Set up the crawler from its Trafilatura-specific settings.

        Args:
            config: Settings for this crawler (thread count, optional parser)
        """
        super().__init__(config)
        self.config: TrafilaturaConfig = config

    @property
    def needs_parser(self) -> bool:
        """This crawler keeps the configured parser for document URLs."""
        return True

    def crawl(self, urls: List[str]) -> List[Document]:
        """Download `urls` and return one or more documents per usable URL.

        Each URL is first offered to `_process_document`; only when that
        yields nothing is the downloaded payload run through
        `trafilatura.extract`, falling back to the raw payload if it is a
        string and extraction returned None.
        """
        import trafilatura
        from trafilatura.downloads import (
            add_to_compressed_dict,
            buffered_downloads,
            load_download_buffer,
        )

        collected = []
        pending = add_to_compressed_dict(urls)

        while not pending.done:
            batch, pending = load_download_buffer(pending, sleep_time=5)
            for url, downloaded in buffered_downloads(batch, self.config.threads):
                chunks = self._process_document(url)
                if chunks:
                    collected.extend(chunks)
                    continue

                text = trafilatura.extract(
                    downloaded, no_fallback=False, favor_recall=True
                )
                # isinstance() already rules out None for the payload.
                if text is None and isinstance(downloaded, str):
                    text = downloaded
                if text:
                    source = DocMetaData(source=url)
                    collected.append(Document(content=text, metadata=source))

        return collected
202
+
203
+
204
class FirecrawlCrawler(BaseCrawler):
    """Crawler implementation using Firecrawl.

    In "scrape" mode every URL is fetched individually; in "crawl" mode a
    single start URL is crawled and pages are collected as they arrive.
    """

    # Crawl statuses after which polling can never produce further pages.
    _ABORTED_STATUSES = ("failed", "cancelled")

    def __init__(self, config: FirecrawlConfig) -> None:
        """Initialize the Firecrawl crawler.

        Args:
            config: Configuration for the crawler
        """
        super().__init__(config)
        self.config: FirecrawlConfig = config

    @property
    def needs_parser(self) -> bool:
        """Firecrawl results are used as-is, so no parser is kept."""
        return False

    def _return_save_incremental_results(
        self, app: "FirecrawlApp", crawl_id: str, output_dir: str = "firecrawl_output"
    ) -> List[Document]:
        """Poll a running crawl, saving and collecting pages as they appear.

        Adapted from the Firecrawl blog:
        https://www.firecrawl.dev/blog/mastering-the-crawl-endpoint-in-firecrawl

        Args:
            app: Firecrawl client used to query the crawl status.
            crawl_id: Identifier of the crawl to follow.
            output_dir: Directory receiving one markdown file per page and,
                on completion, `full_results.json`.

        Returns:
            One document per distinct page URL seen. If the crawl ends as
            failed or cancelled, the pages gathered so far are returned.
        """
        import json
        import time
        from pathlib import Path

        from tqdm import tqdm

        Path(output_dir).mkdir(parents=True, exist_ok=True)
        processed_urls: set[str] = set()
        docs: List[Document] = []

        pbar = tqdm(desc="Pages saved", unit=" pages", dynamic_ncols=True)
        try:
            while True:
                # Check current status
                status = app.check_crawl_status(crawl_id)
                new_pages = 0

                # Save new pages; "data" may be absent or None early on.
                for page in status.get("data") or []:
                    url = page["metadata"]["url"]
                    if url in processed_urls:
                        continue
                    content = page.get("markdown") or ""
                    filename = f"{output_dir}/{len(processed_urls)}.md"
                    # Explicit encoding: page text is not limited to the
                    # platform's default code page.
                    with open(filename, "w", encoding="utf-8") as f:
                        f.write(content)
                    docs.append(
                        Document(content=content, metadata=DocMetaData(source=url))
                    )
                    processed_urls.add(url)
                    new_pages += 1
                pbar.update(new_pages)  # Update progress bar with new pages

                state = status["status"]
                if state == "completed":
                    print(f"Saved {len(processed_urls)} pages.")
                    with open(
                        f"{output_dir}/full_results.json", "w", encoding="utf-8"
                    ) as f:
                        json.dump(status, f, indent=2)
                    break
                if state in self._ABORTED_STATUSES:
                    # Without this the loop would poll a dead crawl forever.
                    logging.warning(
                        f"Firecrawl crawl {crawl_id} ended with status "
                        f"'{state}'; returning {len(processed_urls)} pages."
                    )
                    break

                time.sleep(5)  # Wait before checking again
        finally:
            pbar.close()
        return docs

    def crawl(self, urls: List[str]) -> List[Document]:
        """Load `urls` through Firecrawl according to the configured mode.

        Args:
            urls: URLs to scrape, or a one-element list holding the start
                URL when the mode is "crawl".

        Returns:
            The documents obtained; URLs that fail in "scrape" mode are
            logged and skipped.

        Raises:
            LangroidImportError: If the `firecrawl` package is not installed.
            ValueError: If the mode is "crawl" and `urls` is not a list with
                exactly one URL.
        """
        try:
            from firecrawl import FirecrawlApp
        except ImportError:
            raise LangroidImportError("firecrawl", "firecrawl")

        app = FirecrawlApp(api_key=self.config.api_key)
        docs: List[Document] = []
        params = self.config.params.copy()  # never mutate the config's dict

        if self.config.timeout is not None:
            params["timeout"] = self.config.timeout  # Add/override timeout in params

        if self.config.mode == "scrape":
            for url in urls:
                try:
                    result = app.scrape_url(url, params=params)
                    metadata = result.get("metadata", {})
                    status_code = metadata.get("statusCode")

                    if status_code == 200:
                        docs.append(
                            Document(
                                content=result["markdown"],
                                metadata=DocMetaData(source=url),
                            )
                        )
                    else:
                        logging.warning(
                            f"Firecrawl returned status {status_code} for {url}. "
                            "Skipping."
                        )
                except Exception as e:
                    logging.warning(
                        f"Firecrawl encountered an error for {url}: {e}. "
                        "Skipping but continuing."
                    )
        elif self.config.mode == "crawl":
            if not isinstance(urls, list) or len(urls) != 1:
                raise ValueError(
                    "Crawl mode expects 'urls' to be a list containing a single URL."
                )

            # Start the crawl
            crawl_status = app.async_crawl_url(url=urls[0], params=params)

            # Save results incrementally
            docs = self._return_save_incremental_results(app, crawl_status["id"])
        else:
            # Still returns no documents, but no longer does so silently.
            logging.warning(
                f"Unsupported Firecrawl mode '{self.config.mode}'; "
                "expected 'scrape' or 'crawl'. No URLs were loaded."
            )
        return docs
312
+
313
+
314
class URLLoader:
    """Loads URLs and extracts text using a specified crawler."""

    def __init__(
        self,
        urls: List[Any],
        parsing_config: Optional[ParsingConfig] = None,
        crawler_config: Optional[BaseCrawlerConfig] = None,
    ):
        """Initialize the URL loader.

        Args:
            urls: List of URLs to load
            parsing_config: Configuration for parsing; a fresh
                `ParsingConfig()` is created per loader when omitted
            crawler_config: Configuration for the crawler; defaults to a
                `TrafilaturaConfig` built around `parsing_config`
        """
        # Build the default here, not in the signature, so loaders never
        # share one config instance.
        if parsing_config is None:
            parsing_config = ParsingConfig()

        self.urls = urls
        self.parsing_config = parsing_config

        if crawler_config is None:
            crawler_config = TrafilaturaConfig(parser=Parser(parsing_config))

        self.crawler = CrawlerFactory.create_crawler(crawler_config)

        # A caller-supplied crawler config may carry no parser (for example a
        # bare TrafilaturaConfig()); without one, document URLs would never
        # be parsed and `parsing_config` would be ignored.
        if self.crawler.needs_parser and self.crawler.parser is None:
            self.crawler.parser = Parser(parsing_config)

    def load(self) -> List[Document]:
        """Load the URLs using the specified crawler."""
        return self.crawler.crawl(self.urls)
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "langroid"
3
- version = "0.45.10"
3
+ version = "0.47.0"
4
4
  authors = [
5
5
  {name = "Prasad Chalasani", email = "pchalasani@gmail.com"},
6
6
  ]
@@ -265,6 +265,9 @@ pinecone = [
265
265
  asyncio = [
266
266
  "asyncio>=3.4.3",
267
267
  ]
268
+ firecrawl = [
269
+ "firecrawl-py>=1.13.5",
270
+ ]
268
271
 
269
272
 
270
273
  [dependency-groups]