langroid 0.1.133__tar.gz → 0.1.135__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (103)
  1. {langroid-0.1.133 → langroid-0.1.135}/PKG-INFO +10 -10
  2. {langroid-0.1.133 → langroid-0.1.135}/README.md +9 -9
  3. {langroid-0.1.133 → langroid-0.1.135}/langroid/agent/base.py +3 -0
  4. {langroid-0.1.133 → langroid-0.1.135}/langroid/agent/task.py +1 -0
  5. {langroid-0.1.133 → langroid-0.1.135}/langroid/parsing/parser.py +56 -37
  6. {langroid-0.1.133 → langroid-0.1.135}/pyproject.toml +1 -1
  7. {langroid-0.1.133 → langroid-0.1.135}/LICENSE +0 -0
  8. {langroid-0.1.133 → langroid-0.1.135}/langroid/__init__.py +0 -0
  9. {langroid-0.1.133 → langroid-0.1.135}/langroid/agent/__init__.py +0 -0
  10. {langroid-0.1.133 → langroid-0.1.135}/langroid/agent/batch.py +0 -0
  11. {langroid-0.1.133 → langroid-0.1.135}/langroid/agent/chat_agent.py +0 -0
  12. {langroid-0.1.133 → langroid-0.1.135}/langroid/agent/chat_document.py +0 -0
  13. {langroid-0.1.133 → langroid-0.1.135}/langroid/agent/helpers.py +0 -0
  14. {langroid-0.1.133 → langroid-0.1.135}/langroid/agent/junk +0 -0
  15. {langroid-0.1.133 → langroid-0.1.135}/langroid/agent/openai_assistant.py +0 -0
  16. {langroid-0.1.133 → langroid-0.1.135}/langroid/agent/special/__init__.py +0 -0
  17. {langroid-0.1.133 → langroid-0.1.135}/langroid/agent/special/doc_chat_agent.py +0 -0
  18. {langroid-0.1.133 → langroid-0.1.135}/langroid/agent/special/recipient_validator_agent.py +0 -0
  19. {langroid-0.1.133 → langroid-0.1.135}/langroid/agent/special/relevance_extractor_agent.py +0 -0
  20. {langroid-0.1.133 → langroid-0.1.135}/langroid/agent/special/retriever_agent.py +0 -0
  21. {langroid-0.1.133 → langroid-0.1.135}/langroid/agent/special/sql/__init__.py +0 -0
  22. {langroid-0.1.133 → langroid-0.1.135}/langroid/agent/special/sql/sql_chat_agent.py +0 -0
  23. {langroid-0.1.133 → langroid-0.1.135}/langroid/agent/special/sql/utils/__init__.py +0 -0
  24. {langroid-0.1.133 → langroid-0.1.135}/langroid/agent/special/sql/utils/description_extractors.py +0 -0
  25. {langroid-0.1.133 → langroid-0.1.135}/langroid/agent/special/sql/utils/populate_metadata.py +0 -0
  26. {langroid-0.1.133 → langroid-0.1.135}/langroid/agent/special/sql/utils/system_message.py +0 -0
  27. {langroid-0.1.133 → langroid-0.1.135}/langroid/agent/special/sql/utils/tools.py +0 -0
  28. {langroid-0.1.133 → langroid-0.1.135}/langroid/agent/special/table_chat_agent.py +0 -0
  29. {langroid-0.1.133 → langroid-0.1.135}/langroid/agent/tool_message.py +0 -0
  30. {langroid-0.1.133 → langroid-0.1.135}/langroid/agent/tools/__init__.py +0 -0
  31. {langroid-0.1.133 → langroid-0.1.135}/langroid/agent/tools/extract_tool.py +0 -0
  32. {langroid-0.1.133 → langroid-0.1.135}/langroid/agent/tools/generator_tool.py +0 -0
  33. {langroid-0.1.133 → langroid-0.1.135}/langroid/agent/tools/google_search_tool.py +0 -0
  34. {langroid-0.1.133 → langroid-0.1.135}/langroid/agent/tools/recipient_tool.py +0 -0
  35. {langroid-0.1.133 → langroid-0.1.135}/langroid/agent/tools/run_python_code.py +0 -0
  36. {langroid-0.1.133 → langroid-0.1.135}/langroid/agent/tools/segment_extract_tool.py +0 -0
  37. {langroid-0.1.133 → langroid-0.1.135}/langroid/agent_config.py +0 -0
  38. {langroid-0.1.133 → langroid-0.1.135}/langroid/cachedb/__init__.py +0 -0
  39. {langroid-0.1.133 → langroid-0.1.135}/langroid/cachedb/base.py +0 -0
  40. {langroid-0.1.133 → langroid-0.1.135}/langroid/cachedb/momento_cachedb.py +0 -0
  41. {langroid-0.1.133 → langroid-0.1.135}/langroid/cachedb/redis_cachedb.py +0 -0
  42. {langroid-0.1.133 → langroid-0.1.135}/langroid/embedding_models/__init__.py +0 -0
  43. {langroid-0.1.133 → langroid-0.1.135}/langroid/embedding_models/base.py +0 -0
  44. {langroid-0.1.133 → langroid-0.1.135}/langroid/embedding_models/clustering.py +0 -0
  45. {langroid-0.1.133 → langroid-0.1.135}/langroid/embedding_models/models.py +0 -0
  46. {langroid-0.1.133 → langroid-0.1.135}/langroid/language_models/__init__.py +0 -0
  47. {langroid-0.1.133 → langroid-0.1.135}/langroid/language_models/azure_openai.py +0 -0
  48. {langroid-0.1.133 → langroid-0.1.135}/langroid/language_models/base.py +0 -0
  49. {langroid-0.1.133 → langroid-0.1.135}/langroid/language_models/config.py +0 -0
  50. {langroid-0.1.133 → langroid-0.1.135}/langroid/language_models/openai_assistants.py +0 -0
  51. {langroid-0.1.133 → langroid-0.1.135}/langroid/language_models/openai_gpt.py +0 -0
  52. {langroid-0.1.133 → langroid-0.1.135}/langroid/language_models/prompt_formatter/__init__.py +0 -0
  53. {langroid-0.1.133 → langroid-0.1.135}/langroid/language_models/prompt_formatter/base.py +0 -0
  54. {langroid-0.1.133 → langroid-0.1.135}/langroid/language_models/prompt_formatter/llama2_formatter.py +0 -0
  55. {langroid-0.1.133 → langroid-0.1.135}/langroid/language_models/utils.py +0 -0
  56. {langroid-0.1.133 → langroid-0.1.135}/langroid/mytypes.py +0 -0
  57. {langroid-0.1.133 → langroid-0.1.135}/langroid/parsing/__init__.py +0 -0
  58. {langroid-0.1.133 → langroid-0.1.135}/langroid/parsing/agent_chats.py +0 -0
  59. {langroid-0.1.133 → langroid-0.1.135}/langroid/parsing/code-parsing.md +0 -0
  60. {langroid-0.1.133 → langroid-0.1.135}/langroid/parsing/code_parser.py +0 -0
  61. {langroid-0.1.133 → langroid-0.1.135}/langroid/parsing/config.py +0 -0
  62. {langroid-0.1.133 → langroid-0.1.135}/langroid/parsing/document_parser.py +0 -0
  63. {langroid-0.1.133 → langroid-0.1.135}/langroid/parsing/json.py +0 -0
  64. {langroid-0.1.133 → langroid-0.1.135}/langroid/parsing/para_sentence_split.py +0 -0
  65. {langroid-0.1.133 → langroid-0.1.135}/langroid/parsing/repo_loader.py +0 -0
  66. {langroid-0.1.133 → langroid-0.1.135}/langroid/parsing/search.py +0 -0
  67. {langroid-0.1.133 → langroid-0.1.135}/langroid/parsing/spider.py +0 -0
  68. {langroid-0.1.133 → langroid-0.1.135}/langroid/parsing/table_loader.py +0 -0
  69. {langroid-0.1.133 → langroid-0.1.135}/langroid/parsing/url_loader.py +0 -0
  70. {langroid-0.1.133 → langroid-0.1.135}/langroid/parsing/url_loader_cookies.py +0 -0
  71. {langroid-0.1.133 → langroid-0.1.135}/langroid/parsing/urls.py +0 -0
  72. {langroid-0.1.133 → langroid-0.1.135}/langroid/parsing/utils.py +0 -0
  73. {langroid-0.1.133 → langroid-0.1.135}/langroid/parsing/web_search.py +0 -0
  74. {langroid-0.1.133 → langroid-0.1.135}/langroid/prompts/__init__.py +0 -0
  75. {langroid-0.1.133 → langroid-0.1.135}/langroid/prompts/dialog.py +0 -0
  76. {langroid-0.1.133 → langroid-0.1.135}/langroid/prompts/prompts_config.py +0 -0
  77. {langroid-0.1.133 → langroid-0.1.135}/langroid/prompts/templates.py +0 -0
  78. {langroid-0.1.133 → langroid-0.1.135}/langroid/prompts/transforms.py +0 -0
  79. {langroid-0.1.133 → langroid-0.1.135}/langroid/utils/__init__.py +0 -0
  80. {langroid-0.1.133 → langroid-0.1.135}/langroid/utils/algorithms/__init__.py +0 -0
  81. {langroid-0.1.133 → langroid-0.1.135}/langroid/utils/algorithms/graph.py +0 -0
  82. {langroid-0.1.133 → langroid-0.1.135}/langroid/utils/configuration.py +0 -0
  83. {langroid-0.1.133 → langroid-0.1.135}/langroid/utils/constants.py +0 -0
  84. {langroid-0.1.133 → langroid-0.1.135}/langroid/utils/docker.py +0 -0
  85. {langroid-0.1.133 → langroid-0.1.135}/langroid/utils/globals.py +0 -0
  86. {langroid-0.1.133 → langroid-0.1.135}/langroid/utils/llms/__init__.py +0 -0
  87. {langroid-0.1.133 → langroid-0.1.135}/langroid/utils/llms/strings.py +0 -0
  88. {langroid-0.1.133 → langroid-0.1.135}/langroid/utils/logging.py +0 -0
  89. {langroid-0.1.133 → langroid-0.1.135}/langroid/utils/output/__init__.py +0 -0
  90. {langroid-0.1.133 → langroid-0.1.135}/langroid/utils/output/printing.py +0 -0
  91. {langroid-0.1.133 → langroid-0.1.135}/langroid/utils/pydantic_utils.py +0 -0
  92. {langroid-0.1.133 → langroid-0.1.135}/langroid/utils/system.py +0 -0
  93. {langroid-0.1.133 → langroid-0.1.135}/langroid/utils/web/__init__.py +0 -0
  94. {langroid-0.1.133 → langroid-0.1.135}/langroid/utils/web/login.py +0 -0
  95. {langroid-0.1.133 → langroid-0.1.135}/langroid/utils/web/selenium_login.py +0 -0
  96. {langroid-0.1.133 → langroid-0.1.135}/langroid/vector_store/__init__.py +0 -0
  97. {langroid-0.1.133 → langroid-0.1.135}/langroid/vector_store/base.py +0 -0
  98. {langroid-0.1.133 → langroid-0.1.135}/langroid/vector_store/chromadb.py +0 -0
  99. {langroid-0.1.133 → langroid-0.1.135}/langroid/vector_store/lancedb.py +0 -0
  100. {langroid-0.1.133 → langroid-0.1.135}/langroid/vector_store/meilisearch.py +0 -0
  101. {langroid-0.1.133 → langroid-0.1.135}/langroid/vector_store/momento.py +0 -0
  102. {langroid-0.1.133 → langroid-0.1.135}/langroid/vector_store/qdrant_cloud.py +0 -0
  103. {langroid-0.1.133 → langroid-0.1.135}/langroid/vector_store/qdrantdb.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: langroid
3
- Version: 0.1.133
3
+ Version: 0.1.135
4
4
  Summary: Harness LLMs with Multi-Agent Programming
5
5
  License: MIT
6
6
  Author: Prasad Chalasani
@@ -100,7 +100,7 @@ Description-Content-Type: text/markdown
100
100
  [![Multi-Architecture DockerHub](https://github.com/langroid/langroid/actions/workflows/docker-publish.yml/badge.svg)](https://github.com/langroid/langroid/actions/workflows/docker-publish.yml)
101
101
 
102
102
  [![Static Badge](https://img.shields.io/badge/Documentation-blue?link=https%3A%2F%2Flangroid.github.io%2Flangroid%2F&link=https%3A%2F%2Flangroid.github.io%2Flangroid%2F)](https://langroid.github.io/langroid)
103
- [![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/langroid/langroid/blob/main/examples/langroid_quick_examples.ipynb)
103
+ [![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/langroid/langroid/blob/main/examples/Langroid_quick_start.ipynb)
104
104
  [![Discord](https://img.shields.io/badge/Discord-%235865F2.svg?style=flat&logo=discord&logoColor=white)](https://discord.gg/ZU36McDgDs)
105
105
  [![Substack](https://img.shields.io/badge/Substack-%23006f5c.svg?style=flat&logo=substack&logoColor=FF6719)](https://langroid.substack.com/p/langroid-harness-llms-with-multi-agent-programming)
106
106
  </div>
@@ -138,7 +138,7 @@ We welcome contributions -- See the [contributions](./CONTRIBUTING.md) document
138
138
  for ideas on what to contribute.
139
139
 
140
140
 
141
- Building LLM Applications? Prasad Chalasani is available for consulting
141
+ Building LLM Applications? [Prasad Chalasani](https://www.linkedin.com/in/pchalasani/) is available for consulting
142
142
  (advisory/development): pchalasani at gmail dot com.
143
143
 
144
144
  Sponsorship is also accepted via [GitHub Sponsors](https://github.com/sponsors/langroid)
@@ -148,12 +148,12 @@ Sponsorship is also accepted via [GitHub Sponsors](https://github.com/sponsors/l
148
148
  # Quick glimpse of coding with Langroid
149
149
  This is just a teaser; there's much more, like function-calling/tools,
150
150
  Multi-Agent Collaboration, Structured Information Extraction, DocChatAgent
151
- (RAG), SQLChatAgent, etc. Scroll down or see docs for more.
152
-
153
- :fire: Just released! See this [Colab](https://colab.research.google.com/drive/190Tk7t4AdY1P9F_NlZ33-YEoGnHweQQ0)
154
- for a walk-through of the new `OpenAIAssistant` class (with near-complete support for the new OpenAI Assistants API)
155
- in a multi-agent setting.
151
+ (RAG), SQLChatAgent, non-OpenAI local/remote LLMs, etc. Scroll down or see docs for more.
156
152
 
153
+ :fire: Just released! Updated Langroid Quick-Start [Colab](https://colab.research.google.com/github/langroid/langroid/blob/main/examples/Langroid_quick_start.ipynb)
154
+ that builds up to a 2-agent chat example using the OpenAI ChatCompletion API.
155
+ See also this [version](https://colab.research.google.com/drive/190Tk7t4AdY1P9F_NlZ33-YEoGnHweQQ0)
156
+ that uses the OpenAI Assistants API instead.
157
157
 
158
158
  ```python
159
159
  from langroid.language_models import OpenAIGPTConfig, OpenAIChatModel, OpenAIGPT
@@ -264,7 +264,7 @@ See [this test](tests/main/test_recipient_tool.py) for example usage.
264
264
  - **Example:** [Answer questions](examples/docqa/chat-search.py) using Google Search + vecdb-retrieval from URL contents.
265
265
  - **0.1.39:** [`GoogleSearchTool`](langroid/agent/tools/google_search_tool.py) to enable Agents (their LLM) to do Google searches via function-calling/tools.
266
266
  See [this chat example](examples/basic/chat-search.py) for how easy it is to add this tool to an agent.
267
- - **Colab notebook** to try the quick-start examples: [![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/langroid/langroid/blob/main/examples/langroid_quick_examples.ipynb)
267
+ - **Colab notebook** to try the quick-start examples: [![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/langroid/langroid/blob/main/examples/Langroid_quick_start.ipynb)
268
268
  - **0.1.37:** Added [`SQLChatAgent`](langroid/agent/special/sql_chat_agent.py) -- thanks to our latest contributor [Rithwik Babu](https://github.com/rithwikbabu)!
269
269
  - Multi-agent Example: [Autocorrect chat](examples/basic/autocorrect.py)
270
270
  - **July 2023:**
@@ -515,7 +515,7 @@ for a detailed tutorial.
515
515
 
516
516
  Click to expand any of the code examples below.
517
517
  All of these can be run in a Colab notebook:
518
- [![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/langroid/langroid/blob/main/examples/langroid_quick_examples.ipynb)
518
+ [![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/langroid/langroid/blob/main/examples/Langroid_quick_start.ipynb)
519
519
 
520
520
  <details>
521
521
  <summary> <b> Direct interaction with OpenAI LLM </b> </summary>
@@ -11,7 +11,7 @@
11
11
  [![Multi-Architecture DockerHub](https://github.com/langroid/langroid/actions/workflows/docker-publish.yml/badge.svg)](https://github.com/langroid/langroid/actions/workflows/docker-publish.yml)
12
12
 
13
13
  [![Static Badge](https://img.shields.io/badge/Documentation-blue?link=https%3A%2F%2Flangroid.github.io%2Flangroid%2F&link=https%3A%2F%2Flangroid.github.io%2Flangroid%2F)](https://langroid.github.io/langroid)
14
- [![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/langroid/langroid/blob/main/examples/langroid_quick_examples.ipynb)
14
+ [![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/langroid/langroid/blob/main/examples/Langroid_quick_start.ipynb)
15
15
  [![Discord](https://img.shields.io/badge/Discord-%235865F2.svg?style=flat&logo=discord&logoColor=white)](https://discord.gg/ZU36McDgDs)
16
16
  [![Substack](https://img.shields.io/badge/Substack-%23006f5c.svg?style=flat&logo=substack&logoColor=FF6719)](https://langroid.substack.com/p/langroid-harness-llms-with-multi-agent-programming)
17
17
  </div>
@@ -49,7 +49,7 @@ We welcome contributions -- See the [contributions](./CONTRIBUTING.md) document
49
49
  for ideas on what to contribute.
50
50
 
51
51
 
52
- Building LLM Applications? Prasad Chalasani is available for consulting
52
+ Building LLM Applications? [Prasad Chalasani](https://www.linkedin.com/in/pchalasani/) is available for consulting
53
53
  (advisory/development): pchalasani at gmail dot com.
54
54
 
55
55
  Sponsorship is also accepted via [GitHub Sponsors](https://github.com/sponsors/langroid)
@@ -59,12 +59,12 @@ Sponsorship is also accepted via [GitHub Sponsors](https://github.com/sponsors/l
59
59
  # Quick glimpse of coding with Langroid
60
60
  This is just a teaser; there's much more, like function-calling/tools,
61
61
  Multi-Agent Collaboration, Structured Information Extraction, DocChatAgent
62
- (RAG), SQLChatAgent, etc. Scroll down or see docs for more.
63
-
64
- :fire: Just released! See this [Colab](https://colab.research.google.com/drive/190Tk7t4AdY1P9F_NlZ33-YEoGnHweQQ0)
65
- for a walk-through of the new `OpenAIAssistant` class (with near-complete support for the new OpenAI Assistants API)
66
- in a multi-agent setting.
62
+ (RAG), SQLChatAgent, non-OpenAI local/remote LLMs, etc. Scroll down or see docs for more.
67
63
 
64
+ :fire: Just released! Updated Langroid Quick-Start [Colab](https://colab.research.google.com/github/langroid/langroid/blob/main/examples/Langroid_quick_start.ipynb)
65
+ that builds up to a 2-agent chat example using the OpenAI ChatCompletion API.
66
+ See also this [version](https://colab.research.google.com/drive/190Tk7t4AdY1P9F_NlZ33-YEoGnHweQQ0)
67
+ that uses the OpenAI Assistants API instead.
68
68
 
69
69
  ```python
70
70
  from langroid.language_models import OpenAIGPTConfig, OpenAIChatModel, OpenAIGPT
@@ -175,7 +175,7 @@ See [this test](tests/main/test_recipient_tool.py) for example usage.
175
175
  - **Example:** [Answer questions](examples/docqa/chat-search.py) using Google Search + vecdb-retrieval from URL contents.
176
176
  - **0.1.39:** [`GoogleSearchTool`](langroid/agent/tools/google_search_tool.py) to enable Agents (their LLM) to do Google searches via function-calling/tools.
177
177
  See [this chat example](examples/basic/chat-search.py) for how easy it is to add this tool to an agent.
178
- - **Colab notebook** to try the quick-start examples: [![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/langroid/langroid/blob/main/examples/langroid_quick_examples.ipynb)
178
+ - **Colab notebook** to try the quick-start examples: [![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/langroid/langroid/blob/main/examples/Langroid_quick_start.ipynb)
179
179
  - **0.1.37:** Added [`SQLChatAgent`](langroid/agent/special/sql_chat_agent.py) -- thanks to our latest contributor [Rithwik Babu](https://github.com/rithwikbabu)!
180
180
  - Multi-agent Example: [Autocorrect chat](examples/basic/autocorrect.py)
181
181
  - **July 2023:**
@@ -426,7 +426,7 @@ for a detailed tutorial.
426
426
 
427
427
  Click to expand any of the code examples below.
428
428
  All of these can be run in a Colab notebook:
429
- [![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/langroid/langroid/blob/main/examples/langroid_quick_examples.ipynb)
429
+ [![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/langroid/langroid/blob/main/examples/Langroid_quick_start.ipynb)
430
430
 
431
431
  <details>
432
432
  <summary> <b> Direct interaction with OpenAI LLM </b> </summary>
@@ -143,6 +143,9 @@ class Agent(ABC):
143
143
  def get_dialog(self) -> List[Tuple[str, str]]:
144
144
  return self.dialog
145
145
 
146
+ def clear_dialog(self) -> None:
147
+ self.dialog = []
148
+
146
149
  def _get_tool_list(
147
150
  self, message_class: Optional[Type[ToolMessage]] = None
148
151
  ) -> List[str]:
@@ -100,6 +100,7 @@ class Task:
100
100
  if isinstance(agent, ChatAgent) and len(agent.message_history) == 0 or restart:
101
101
  agent = cast(ChatAgent, agent)
102
102
  agent.clear_history(0)
103
+ agent.clear_dialog()
103
104
  # possibly change the system and user messages
104
105
  if system_message:
105
106
  # we always have at least 1 task_message
@@ -1,6 +1,6 @@
1
1
  import logging
2
2
  from enum import Enum
3
- from typing import List
3
+ from typing import Dict, List
4
4
 
5
5
  import tiktoken
6
6
  from pydantic import BaseSettings
@@ -52,27 +52,42 @@ class Parser:
52
52
  return len(tokens)
53
53
 
54
54
  def add_window_ids(self, chunks: List[Document]) -> None:
55
- """Chunks are consecutive parts of a single original document.
56
- Add window_ids in metadata"""
55
+ """Chunks may belong to multiple docs, but for each doc,
56
+ they appear consecutively. Add window_ids in metadata"""
57
57
 
58
58
  # The original metadata.id (if any) is ignored since it will be same for all
59
59
  # chunks and is useless. We want a distinct id for each chunk.
60
+ orig_ids = [c.metadata.id for c in chunks]
60
61
  ids = [Document.hash_id(str(c)) for c in chunks]
62
+ id2chunk = {id: c for id, c in zip(ids, chunks)}
63
+
64
+ # group the ids by orig_id
65
+ orig_id_to_ids: Dict[str, List[str]] = {}
66
+ for orig_id, id in zip(orig_ids, ids):
67
+ if orig_id not in orig_id_to_ids:
68
+ orig_id_to_ids[orig_id] = [] # type: ignore
69
+ orig_id_to_ids[orig_id].append(id) # type: ignore
70
+
71
+ # now each orig_id maps to a sequence of ids within a single doc
61
72
 
62
73
  k = self.config.n_neighbor_ids
63
- n = len(ids)
64
- window_ids = [ids[max(0, i - k) : min(n, i + k + 1)] for i in range(n)]
65
- for i, c in enumerate(chunks):
66
- if c.content.strip() == "":
67
- continue
68
- c.metadata.window_ids = window_ids[i]
69
- c.metadata.id = ids[i]
70
- c.metadata.is_chunk = True
74
+ for orig, ids in orig_id_to_ids.items():
75
+ # ids are consecutive chunks in a single doc
76
+ n = len(ids)
77
+ window_ids = [ids[max(0, i - k) : min(n, i + k + 1)] for i in range(n)]
78
+ for i, _ in enumerate(ids):
79
+ c = id2chunk[ids[i]]
80
+ if c.content.strip() == "":
81
+ continue
82
+ c.metadata.window_ids = window_ids[i]
83
+ c.metadata.id = ids[i]
84
+ c.metadata.is_chunk = True
71
85
 
72
86
  def split_simple(self, docs: List[Document]) -> List[Document]:
73
87
  if len(self.config.separators) == 0:
74
88
  raise ValueError("Must have at least one separator")
75
89
  final_docs = []
90
+
76
91
  for d in docs:
77
92
  if d.content.strip() == "":
78
93
  continue
@@ -89,35 +104,35 @@ class Parser:
89
104
  return final_docs
90
105
 
91
106
  def split_para_sentence(self, docs: List[Document]) -> List[Document]:
92
- final_chunks = []
93
107
  chunks = docs
94
108
  while True:
95
- long_chunks = [
96
- p
97
- for p in chunks
98
- if self.num_tokens(p.content) > 1.3 * self.config.chunk_size
99
- ]
100
- if len(long_chunks) == 0:
101
- break
102
- short_chunks = [
103
- p
104
- for p in chunks
105
- if self.num_tokens(p.content) <= 1.3 * self.config.chunk_size
106
- ]
107
- final_chunks += short_chunks
108
- chunks = self._split_para_sentence_once(long_chunks)
109
- if len(chunks) == len(long_chunks):
110
- max_len = max([self.num_tokens(p.content) for p in long_chunks])
111
- logger.warning(
112
- f"""
113
- Unable to split {len(long_chunks)} long chunks
114
- using chunk_size = {self.config.chunk_size}.
115
- Max chunk size is {max_len} tokens.
116
- """
117
- )
109
+ un_splittables = 0
110
+ split_chunks = []
111
+ for c in chunks:
112
+ if c.content.strip() == "":
113
+ continue
114
+ if self.num_tokens(c.content) <= 1.3 * self.config.chunk_size:
115
+ # small chunk: no need to split
116
+ split_chunks.append(c)
117
+ continue
118
+ splits = self._split_para_sentence_once([c])
119
+ un_splittables += len(splits) == 1
120
+ split_chunks += splits
121
+ if len(split_chunks) == len(chunks):
122
+ if un_splittables > 0:
123
+ max_len = max([self.num_tokens(p.content) for p in chunks])
124
+ logger.warning(
125
+ f"""
126
+ Unable to split {un_splittables} chunks
127
+ using chunk_size = {self.config.chunk_size}.
128
+ Max chunk size is {max_len} tokens.
129
+ """
130
+ )
118
131
  break # we won't be able to shorten them with current settings
132
+ chunks = split_chunks.copy()
119
133
 
120
- return final_chunks + chunks
134
+ self.add_window_ids(chunks)
135
+ return chunks
121
136
 
122
137
  def _split_para_sentence_once(self, docs: List[Document]) -> List[Document]:
123
138
  final_chunks = []
@@ -132,7 +147,6 @@ class Parser:
132
147
  for c in chunks
133
148
  if c.strip() != ""
134
149
  ]
135
- self.add_window_ids(chunk_docs)
136
150
  final_chunks += chunk_docs
137
151
 
138
152
  return final_chunks
@@ -240,6 +254,11 @@ class Parser:
240
254
  def split(self, docs: List[Document]) -> List[Document]:
241
255
  if len(docs) == 0:
242
256
  return []
257
+ # create ids in metadata of docs if absent:
258
+ # we need this to distinguish docs later in add_window_ids
259
+ for d in docs:
260
+ if d.metadata.id is None:
261
+ d.metadata.id = d.id()
243
262
  # some docs are already splits, so don't split them further!
244
263
  chunked_docs = [d for d in docs if d.metadata.is_chunk]
245
264
  big_docs = [d for d in docs if not d.metadata.is_chunk]
@@ -1,6 +1,6 @@
1
1
  [tool.poetry]
2
2
  name = "langroid"
3
- version = "0.1.133"
3
+ version = "0.1.135"
4
4
  description = "Harness LLMs with Multi-Agent Programming"
5
5
  authors = ["Prasad Chalasani <pchalasani@gmail.com>"]
6
6
  readme = "README.md"
File without changes