langroid 0.36.1__tar.gz → 0.37.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (129)
  1. {langroid-0.36.1 → langroid-0.37.0}/PKG-INFO +20 -11
  2. {langroid-0.36.1 → langroid-0.37.0}/langroid/agent/special/doc_chat_agent.py +3 -3
  3. {langroid-0.36.1 → langroid-0.37.0}/langroid/embedding_models/models.py +2 -2
  4. {langroid-0.36.1 → langroid-0.37.0}/langroid/exceptions.py +16 -4
  5. {langroid-0.36.1 → langroid-0.37.0}/langroid/parsing/code_parser.py +1 -1
  6. {langroid-0.36.1 → langroid-0.37.0}/langroid/parsing/document_parser.py +167 -64
  7. {langroid-0.36.1 → langroid-0.37.0}/langroid/parsing/parser.py +6 -4
  8. {langroid-0.36.1 → langroid-0.37.0}/langroid/vector_store/chromadb.py +12 -1
  9. {langroid-0.36.1 → langroid-0.37.0}/langroid/vector_store/qdrantdb.py +1 -1
  10. {langroid-0.36.1 → langroid-0.37.0}/langroid/vector_store/weaviatedb.py +5 -5
  11. {langroid-0.36.1 → langroid-0.37.0}/pyproject.toml +24 -11
  12. {langroid-0.36.1 → langroid-0.37.0}/.gitignore +0 -0
  13. {langroid-0.36.1 → langroid-0.37.0}/LICENSE +0 -0
  14. {langroid-0.36.1 → langroid-0.37.0}/README.md +0 -0
  15. {langroid-0.36.1 → langroid-0.37.0}/langroid/__init__.py +0 -0
  16. {langroid-0.36.1 → langroid-0.37.0}/langroid/agent/__init__.py +0 -0
  17. {langroid-0.36.1 → langroid-0.37.0}/langroid/agent/base.py +0 -0
  18. {langroid-0.36.1 → langroid-0.37.0}/langroid/agent/batch.py +0 -0
  19. {langroid-0.36.1 → langroid-0.37.0}/langroid/agent/callbacks/__init__.py +0 -0
  20. {langroid-0.36.1 → langroid-0.37.0}/langroid/agent/callbacks/chainlit.py +0 -0
  21. {langroid-0.36.1 → langroid-0.37.0}/langroid/agent/chat_agent.py +0 -0
  22. {langroid-0.36.1 → langroid-0.37.0}/langroid/agent/chat_document.py +0 -0
  23. {langroid-0.36.1 → langroid-0.37.0}/langroid/agent/openai_assistant.py +0 -0
  24. {langroid-0.36.1 → langroid-0.37.0}/langroid/agent/special/__init__.py +0 -0
  25. {langroid-0.36.1 → langroid-0.37.0}/langroid/agent/special/arangodb/__init__.py +0 -0
  26. {langroid-0.36.1 → langroid-0.37.0}/langroid/agent/special/arangodb/arangodb_agent.py +0 -0
  27. {langroid-0.36.1 → langroid-0.37.0}/langroid/agent/special/arangodb/system_messages.py +0 -0
  28. {langroid-0.36.1 → langroid-0.37.0}/langroid/agent/special/arangodb/tools.py +0 -0
  29. {langroid-0.36.1 → langroid-0.37.0}/langroid/agent/special/arangodb/utils.py +0 -0
  30. {langroid-0.36.1 → langroid-0.37.0}/langroid/agent/special/lance_doc_chat_agent.py +0 -0
  31. {langroid-0.36.1 → langroid-0.37.0}/langroid/agent/special/lance_rag/__init__.py +0 -0
  32. {langroid-0.36.1 → langroid-0.37.0}/langroid/agent/special/lance_rag/critic_agent.py +0 -0
  33. {langroid-0.36.1 → langroid-0.37.0}/langroid/agent/special/lance_rag/lance_rag_task.py +0 -0
  34. {langroid-0.36.1 → langroid-0.37.0}/langroid/agent/special/lance_rag/query_planner_agent.py +0 -0
  35. {langroid-0.36.1 → langroid-0.37.0}/langroid/agent/special/lance_tools.py +0 -0
  36. {langroid-0.36.1 → langroid-0.37.0}/langroid/agent/special/neo4j/__init__.py +0 -0
  37. {langroid-0.36.1 → langroid-0.37.0}/langroid/agent/special/neo4j/csv_kg_chat.py +0 -0
  38. {langroid-0.36.1 → langroid-0.37.0}/langroid/agent/special/neo4j/neo4j_chat_agent.py +0 -0
  39. {langroid-0.36.1 → langroid-0.37.0}/langroid/agent/special/neo4j/system_messages.py +0 -0
  40. {langroid-0.36.1 → langroid-0.37.0}/langroid/agent/special/neo4j/tools.py +0 -0
  41. {langroid-0.36.1 → langroid-0.37.0}/langroid/agent/special/relevance_extractor_agent.py +0 -0
  42. {langroid-0.36.1 → langroid-0.37.0}/langroid/agent/special/retriever_agent.py +0 -0
  43. {langroid-0.36.1 → langroid-0.37.0}/langroid/agent/special/sql/__init__.py +0 -0
  44. {langroid-0.36.1 → langroid-0.37.0}/langroid/agent/special/sql/sql_chat_agent.py +0 -0
  45. {langroid-0.36.1 → langroid-0.37.0}/langroid/agent/special/sql/utils/__init__.py +0 -0
  46. {langroid-0.36.1 → langroid-0.37.0}/langroid/agent/special/sql/utils/description_extractors.py +0 -0
  47. {langroid-0.36.1 → langroid-0.37.0}/langroid/agent/special/sql/utils/populate_metadata.py +0 -0
  48. {langroid-0.36.1 → langroid-0.37.0}/langroid/agent/special/sql/utils/system_message.py +0 -0
  49. {langroid-0.36.1 → langroid-0.37.0}/langroid/agent/special/sql/utils/tools.py +0 -0
  50. {langroid-0.36.1 → langroid-0.37.0}/langroid/agent/special/table_chat_agent.py +0 -0
  51. {langroid-0.36.1 → langroid-0.37.0}/langroid/agent/task.py +0 -0
  52. {langroid-0.36.1 → langroid-0.37.0}/langroid/agent/tool_message.py +0 -0
  53. {langroid-0.36.1 → langroid-0.37.0}/langroid/agent/tools/__init__.py +0 -0
  54. {langroid-0.36.1 → langroid-0.37.0}/langroid/agent/tools/duckduckgo_search_tool.py +0 -0
  55. {langroid-0.36.1 → langroid-0.37.0}/langroid/agent/tools/file_tools.py +0 -0
  56. {langroid-0.36.1 → langroid-0.37.0}/langroid/agent/tools/google_search_tool.py +0 -0
  57. {langroid-0.36.1 → langroid-0.37.0}/langroid/agent/tools/metaphor_search_tool.py +0 -0
  58. {langroid-0.36.1 → langroid-0.37.0}/langroid/agent/tools/orchestration.py +0 -0
  59. {langroid-0.36.1 → langroid-0.37.0}/langroid/agent/tools/recipient_tool.py +0 -0
  60. {langroid-0.36.1 → langroid-0.37.0}/langroid/agent/tools/retrieval_tool.py +0 -0
  61. {langroid-0.36.1 → langroid-0.37.0}/langroid/agent/tools/rewind_tool.py +0 -0
  62. {langroid-0.36.1 → langroid-0.37.0}/langroid/agent/tools/segment_extract_tool.py +0 -0
  63. {langroid-0.36.1 → langroid-0.37.0}/langroid/agent/xml_tool_message.py +0 -0
  64. {langroid-0.36.1 → langroid-0.37.0}/langroid/cachedb/__init__.py +0 -0
  65. {langroid-0.36.1 → langroid-0.37.0}/langroid/cachedb/base.py +0 -0
  66. {langroid-0.36.1 → langroid-0.37.0}/langroid/cachedb/momento_cachedb.py +0 -0
  67. {langroid-0.36.1 → langroid-0.37.0}/langroid/cachedb/redis_cachedb.py +0 -0
  68. {langroid-0.36.1 → langroid-0.37.0}/langroid/embedding_models/__init__.py +0 -0
  69. {langroid-0.36.1 → langroid-0.37.0}/langroid/embedding_models/base.py +0 -0
  70. {langroid-0.36.1 → langroid-0.37.0}/langroid/embedding_models/protoc/__init__.py +0 -0
  71. {langroid-0.36.1 → langroid-0.37.0}/langroid/embedding_models/protoc/embeddings.proto +0 -0
  72. {langroid-0.36.1 → langroid-0.37.0}/langroid/embedding_models/protoc/embeddings_pb2.py +0 -0
  73. {langroid-0.36.1 → langroid-0.37.0}/langroid/embedding_models/protoc/embeddings_pb2.pyi +0 -0
  74. {langroid-0.36.1 → langroid-0.37.0}/langroid/embedding_models/protoc/embeddings_pb2_grpc.py +0 -0
  75. {langroid-0.36.1 → langroid-0.37.0}/langroid/embedding_models/remote_embeds.py +0 -0
  76. {langroid-0.36.1 → langroid-0.37.0}/langroid/language_models/__init__.py +0 -0
  77. {langroid-0.36.1 → langroid-0.37.0}/langroid/language_models/azure_openai.py +0 -0
  78. {langroid-0.36.1 → langroid-0.37.0}/langroid/language_models/base.py +0 -0
  79. {langroid-0.36.1 → langroid-0.37.0}/langroid/language_models/config.py +0 -0
  80. {langroid-0.36.1 → langroid-0.37.0}/langroid/language_models/mock_lm.py +0 -0
  81. {langroid-0.36.1 → langroid-0.37.0}/langroid/language_models/openai_gpt.py +0 -0
  82. {langroid-0.36.1 → langroid-0.37.0}/langroid/language_models/prompt_formatter/__init__.py +0 -0
  83. {langroid-0.36.1 → langroid-0.37.0}/langroid/language_models/prompt_formatter/base.py +0 -0
  84. {langroid-0.36.1 → langroid-0.37.0}/langroid/language_models/prompt_formatter/hf_formatter.py +0 -0
  85. {langroid-0.36.1 → langroid-0.37.0}/langroid/language_models/prompt_formatter/llama2_formatter.py +0 -0
  86. {langroid-0.36.1 → langroid-0.37.0}/langroid/language_models/utils.py +0 -0
  87. {langroid-0.36.1 → langroid-0.37.0}/langroid/mytypes.py +0 -0
  88. {langroid-0.36.1 → langroid-0.37.0}/langroid/parsing/__init__.py +0 -0
  89. {langroid-0.36.1 → langroid-0.37.0}/langroid/parsing/agent_chats.py +0 -0
  90. {langroid-0.36.1 → langroid-0.37.0}/langroid/parsing/para_sentence_split.py +0 -0
  91. {langroid-0.36.1 → langroid-0.37.0}/langroid/parsing/parse_json.py +0 -0
  92. {langroid-0.36.1 → langroid-0.37.0}/langroid/parsing/repo_loader.py +0 -0
  93. {langroid-0.36.1 → langroid-0.37.0}/langroid/parsing/routing.py +0 -0
  94. {langroid-0.36.1 → langroid-0.37.0}/langroid/parsing/search.py +0 -0
  95. {langroid-0.36.1 → langroid-0.37.0}/langroid/parsing/spider.py +0 -0
  96. {langroid-0.36.1 → langroid-0.37.0}/langroid/parsing/table_loader.py +0 -0
  97. {langroid-0.36.1 → langroid-0.37.0}/langroid/parsing/url_loader.py +0 -0
  98. {langroid-0.36.1 → langroid-0.37.0}/langroid/parsing/urls.py +0 -0
  99. {langroid-0.36.1 → langroid-0.37.0}/langroid/parsing/utils.py +0 -0
  100. {langroid-0.36.1 → langroid-0.37.0}/langroid/parsing/web_search.py +0 -0
  101. {langroid-0.36.1 → langroid-0.37.0}/langroid/prompts/__init__.py +0 -0
  102. {langroid-0.36.1 → langroid-0.37.0}/langroid/prompts/dialog.py +0 -0
  103. {langroid-0.36.1 → langroid-0.37.0}/langroid/prompts/prompts_config.py +0 -0
  104. {langroid-0.36.1 → langroid-0.37.0}/langroid/prompts/templates.py +0 -0
  105. {langroid-0.36.1 → langroid-0.37.0}/langroid/py.typed +0 -0
  106. {langroid-0.36.1 → langroid-0.37.0}/langroid/pydantic_v1/__init__.py +0 -0
  107. {langroid-0.36.1 → langroid-0.37.0}/langroid/pydantic_v1/main.py +0 -0
  108. {langroid-0.36.1 → langroid-0.37.0}/langroid/utils/__init__.py +0 -0
  109. {langroid-0.36.1 → langroid-0.37.0}/langroid/utils/algorithms/__init__.py +0 -0
  110. {langroid-0.36.1 → langroid-0.37.0}/langroid/utils/algorithms/graph.py +0 -0
  111. {langroid-0.36.1 → langroid-0.37.0}/langroid/utils/configuration.py +0 -0
  112. {langroid-0.36.1 → langroid-0.37.0}/langroid/utils/constants.py +0 -0
  113. {langroid-0.36.1 → langroid-0.37.0}/langroid/utils/git_utils.py +0 -0
  114. {langroid-0.36.1 → langroid-0.37.0}/langroid/utils/globals.py +0 -0
  115. {langroid-0.36.1 → langroid-0.37.0}/langroid/utils/logging.py +0 -0
  116. {langroid-0.36.1 → langroid-0.37.0}/langroid/utils/object_registry.py +0 -0
  117. {langroid-0.36.1 → langroid-0.37.0}/langroid/utils/output/__init__.py +0 -0
  118. {langroid-0.36.1 → langroid-0.37.0}/langroid/utils/output/citations.py +0 -0
  119. {langroid-0.36.1 → langroid-0.37.0}/langroid/utils/output/printing.py +0 -0
  120. {langroid-0.36.1 → langroid-0.37.0}/langroid/utils/output/status.py +0 -0
  121. {langroid-0.36.1 → langroid-0.37.0}/langroid/utils/pandas_utils.py +0 -0
  122. {langroid-0.36.1 → langroid-0.37.0}/langroid/utils/pydantic_utils.py +0 -0
  123. {langroid-0.36.1 → langroid-0.37.0}/langroid/utils/system.py +0 -0
  124. {langroid-0.36.1 → langroid-0.37.0}/langroid/utils/types.py +0 -0
  125. {langroid-0.36.1 → langroid-0.37.0}/langroid/vector_store/__init__.py +0 -0
  126. {langroid-0.36.1 → langroid-0.37.0}/langroid/vector_store/base.py +0 -0
  127. {langroid-0.36.1 → langroid-0.37.0}/langroid/vector_store/lancedb.py +0 -0
  128. {langroid-0.36.1 → langroid-0.37.0}/langroid/vector_store/meilisearch.py +0 -0
  129. {langroid-0.36.1 → langroid-0.37.0}/langroid/vector_store/momento.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: langroid
3
- Version: 0.36.1
3
+ Version: 0.37.0
4
4
  Summary: Harness LLMs with Multi-Agent Programming
5
5
  Author-email: Prasad Chalasani <pchalasani@gmail.com>
6
6
  License: MIT
@@ -12,6 +12,7 @@ Requires-Dist: async-generator<2.0,>=1.10
12
12
  Requires-Dist: bs4<1.0.0,>=0.0.1
13
13
  Requires-Dist: cerebras-cloud-sdk<2.0.0,>=1.1.0
14
14
  Requires-Dist: colorlog<7.0.0,>=6.7.0
15
+ Requires-Dist: docling<3.0.0,>=2.16.0
15
16
  Requires-Dist: docstring-parser<1.0,>=0.16
16
17
  Requires-Dist: duckduckgo-search<7.0.0,>=6.0.0
17
18
  Requires-Dist: faker<19.0.0,>=18.9.0
@@ -32,9 +33,10 @@ Requires-Dist: onnxruntime<2.0.0,>=1.16.1
32
33
  Requires-Dist: openai<2.0.0,>=1.45.0
33
34
  Requires-Dist: pandas<3.0.0,>=2.0.3
34
35
  Requires-Dist: prettytable<4.0.0,>=3.8.0
35
- Requires-Dist: pydantic<2.10.2,>=1
36
+ Requires-Dist: pydantic<3.0.0,>=1
36
37
  Requires-Dist: pygithub<2.0.0,>=1.58.1
37
38
  Requires-Dist: pygments<3.0.0,>=2.15.1
39
+ Requires-Dist: pymupdf4llm<0.1.0,>=0.0.17
38
40
  Requires-Dist: pyparsing<4.0.0,>=3.0.9
39
41
  Requires-Dist: pytest-rerunfailures<16.0,>=15.0
40
42
  Requires-Dist: python-dotenv<2.0.0,>=1.0.0
@@ -55,14 +57,15 @@ Provides-Extra: all
55
57
  Requires-Dist: arango-datasets<2.0.0,>=1.2.2; extra == 'all'
56
58
  Requires-Dist: chainlit<3.0.0,>=2.0.1; extra == 'all'
57
59
  Requires-Dist: chromadb<=0.4.23,>=0.4.21; extra == 'all'
60
+ Requires-Dist: docling<3.0.0,>=2.16.0; extra == 'all'
58
61
  Requires-Dist: fastembed<0.4.0,>=0.3.1; extra == 'all'
59
- Requires-Dist: huggingface-hub<0.22.0,>=0.21.2; extra == 'all'
62
+ Requires-Dist: huggingface-hub<1.0.0,>=0.21.2; extra == 'all'
60
63
  Requires-Dist: litellm<2.0.0,>=1.30.1; extra == 'all'
61
64
  Requires-Dist: metaphor-python<0.2.0,>=0.1.23; extra == 'all'
62
65
  Requires-Dist: neo4j<6.0.0,>=5.14.1; extra == 'all'
63
66
  Requires-Dist: pdf2image<2.0.0,>=1.17.0; extra == 'all'
64
- Requires-Dist: pdfplumber<0.11.0,>=0.10.2; extra == 'all'
65
67
  Requires-Dist: psycopg2<3.0.0,>=2.9.7; extra == 'all'
68
+ Requires-Dist: pymupdf4llm<0.1.0,>=0.0.17; extra == 'all'
66
69
  Requires-Dist: pymupdf<2.0.0,>=1.23.3; extra == 'all'
67
70
  Requires-Dist: pymysql<2.0.0,>=1.1.0; extra == 'all'
68
71
  Requires-Dist: pypdf>=5.1.0; extra == 'all'
@@ -74,7 +77,7 @@ Requires-Dist: sentence-transformers<3.0.0,>=2.2.2; extra == 'all'
74
77
  Requires-Dist: sqlalchemy<3.0.0,>=2.0.19; extra == 'all'
75
78
  Requires-Dist: torch<3.0.0,>=2.0.0; extra == 'all'
76
79
  Requires-Dist: transformers<5.0.0,>=4.40.1; extra == 'all'
77
- Requires-Dist: unstructured[docx,pdf,pptx]<0.10.18,>=0.10.16; extra == 'all'
80
+ Requires-Dist: unstructured[docx,pdf,pptx]<1.0.0,>=0.16.15; extra == 'all'
78
81
  Requires-Dist: weaviate-client>=4.9.6; extra == 'all'
79
82
  Provides-Extra: arango
80
83
  Requires-Dist: arango-datasets<2.0.0,>=1.2.2; extra == 'arango'
@@ -89,13 +92,16 @@ Requires-Dist: psycopg2<3.0.0,>=2.9.7; extra == 'db'
89
92
  Requires-Dist: pymysql<2.0.0,>=1.1.0; extra == 'db'
90
93
  Requires-Dist: sqlalchemy<3.0.0,>=2.0.19; extra == 'db'
91
94
  Provides-Extra: doc-chat
95
+ Requires-Dist: docling<3.0.0,>=2.16.0; extra == 'doc-chat'
92
96
  Requires-Dist: pdf2image<2.0.0,>=1.17.0; extra == 'doc-chat'
93
- Requires-Dist: pdfplumber<0.11.0,>=0.10.2; extra == 'doc-chat'
97
+ Requires-Dist: pymupdf4llm<0.1.0,>=0.0.17; extra == 'doc-chat'
94
98
  Requires-Dist: pymupdf<2.0.0,>=1.23.3; extra == 'doc-chat'
95
99
  Requires-Dist: pypdf>=5.1.0; extra == 'doc-chat'
96
100
  Requires-Dist: pytesseract<0.4.0,>=0.3.10; extra == 'doc-chat'
97
101
  Requires-Dist: python-docx<2.0.0,>=1.1.0; extra == 'doc-chat'
98
- Requires-Dist: unstructured[docx,pdf,pptx]<0.10.18,>=0.10.16; extra == 'doc-chat'
102
+ Requires-Dist: unstructured[docx,pdf,pptx]<1.0.0,>=0.16.15; extra == 'doc-chat'
103
+ Provides-Extra: docling
104
+ Requires-Dist: docling<3.0.0,>=2.16.0; extra == 'docling'
99
105
  Provides-Extra: docx
100
106
  Requires-Dist: python-docx<2.0.0,>=1.1.0; extra == 'docx'
101
107
  Provides-Extra: fastembed
@@ -104,7 +110,7 @@ Provides-Extra: hf-embeddings
104
110
  Requires-Dist: sentence-transformers<3.0.0,>=2.2.2; extra == 'hf-embeddings'
105
111
  Requires-Dist: torch<3.0.0,>=2.0.0; extra == 'hf-embeddings'
106
112
  Provides-Extra: hf-transformers
107
- Requires-Dist: huggingface-hub<0.22.0,>=0.21.2; extra == 'hf-transformers'
113
+ Requires-Dist: huggingface-hub<1.0.0,>=0.21.2; extra == 'hf-transformers'
108
114
  Requires-Dist: sentence-transformers<3.0.0,>=2.2.2; extra == 'hf-transformers'
109
115
  Requires-Dist: torch<3.0.0,>=2.0.0; extra == 'hf-transformers'
110
116
  Requires-Dist: transformers<5.0.0,>=4.40.1; extra == 'hf-transformers'
@@ -125,13 +131,16 @@ Requires-Dist: pymysql<2.0.0,>=1.1.0; extra == 'mysql'
125
131
  Provides-Extra: neo4j
126
132
  Requires-Dist: neo4j<6.0.0,>=5.14.1; extra == 'neo4j'
127
133
  Provides-Extra: pdf-parsers
134
+ Requires-Dist: docling<3.0.0,>=2.16.0; extra == 'pdf-parsers'
128
135
  Requires-Dist: pdf2image<2.0.0,>=1.17.0; extra == 'pdf-parsers'
129
- Requires-Dist: pdfplumber<0.11.0,>=0.10.2; extra == 'pdf-parsers'
136
+ Requires-Dist: pymupdf4llm<0.1.0,>=0.0.17; extra == 'pdf-parsers'
130
137
  Requires-Dist: pymupdf<2.0.0,>=1.23.3; extra == 'pdf-parsers'
131
138
  Requires-Dist: pypdf>=5.1.0; extra == 'pdf-parsers'
132
139
  Requires-Dist: pytesseract<0.4.0,>=0.3.10; extra == 'pdf-parsers'
133
140
  Provides-Extra: postgres
134
141
  Requires-Dist: psycopg2<3.0.0,>=2.9.7; extra == 'postgres'
142
+ Provides-Extra: pymupdf4llm
143
+ Requires-Dist: pymupdf4llm<0.1.0,>=0.0.17; extra == 'pymupdf4llm'
135
144
  Provides-Extra: scrapy
136
145
  Requires-Dist: scrapy<3.0.0,>=2.11.0; extra == 'scrapy'
137
146
  Provides-Extra: sql
@@ -139,11 +148,11 @@ Requires-Dist: psycopg2<3.0.0,>=2.9.7; extra == 'sql'
139
148
  Requires-Dist: pymysql<2.0.0,>=1.1.0; extra == 'sql'
140
149
  Requires-Dist: sqlalchemy<3.0.0,>=2.0.19; extra == 'sql'
141
150
  Provides-Extra: transformers
142
- Requires-Dist: huggingface-hub<0.22.0,>=0.21.2; extra == 'transformers'
151
+ Requires-Dist: huggingface-hub<1.0.0,>=0.21.2; extra == 'transformers'
143
152
  Requires-Dist: torch<3.0.0,>=2.0.0; extra == 'transformers'
144
153
  Requires-Dist: transformers<5.0.0,>=4.40.1; extra == 'transformers'
145
154
  Provides-Extra: unstructured
146
- Requires-Dist: unstructured[docx,pdf,pptx]<0.10.18,>=0.10.16; extra == 'unstructured'
155
+ Requires-Dist: unstructured[docx,pdf,pptx]<1.0.0,>=0.16.15; extra == 'unstructured'
147
156
  Provides-Extra: vecdbs
148
157
  Requires-Dist: chromadb<=0.4.23,>=0.4.21; extra == 'vecdbs'
149
158
  Requires-Dist: lancedb<0.9.0,>=0.8.2; extra == 'vecdbs'
@@ -100,7 +100,7 @@ hf_embed_config = SentenceTransformerEmbeddingsConfig(
100
100
 
101
101
  oai_embed_config = OpenAIEmbeddingsConfig(
102
102
  model_type="openai",
103
- model_name="text-embedding-ada-002",
103
+ model_name="text-embedding-3-small",
104
104
  dims=1536,
105
105
  )
106
106
 
@@ -189,8 +189,8 @@ class DocChatAgentConfig(ChatAgentConfig):
189
189
  # NOTE: PDF parsing is extremely challenging, and each library
190
190
  # has its own strengths and weaknesses.
191
191
  # Try one that works for your use case.
192
- # or "unstructured", "pdfplumber", "fitz", "pypdf"
193
- library="pdfplumber",
192
+ # or "unstructured", "fitz", "pymupdf4llm", "pypdf"
193
+ library="pymupdf4llm",
194
194
  ),
195
195
  )
196
196
 
@@ -18,7 +18,7 @@ AzureADTokenProvider = Callable[[], str]
18
18
 
19
19
  class OpenAIEmbeddingsConfig(EmbeddingModelsConfig):
20
20
  model_type: str = "openai"
21
- model_name: str = "text-embedding-ada-002"
21
+ model_name: str = "text-embedding-3-large"
22
22
  api_key: str = ""
23
23
  api_base: Optional[str] = None
24
24
  organization: str = ""
@@ -28,7 +28,7 @@ class OpenAIEmbeddingsConfig(EmbeddingModelsConfig):
28
28
 
29
29
  class AzureOpenAIEmbeddingsConfig(EmbeddingModelsConfig):
30
30
  model_type: str = "azure-openai"
31
- model_name: str = "text-embedding-ada-002"
31
+ model_name: str = "text-embedding-3-large"
32
32
  api_key: str = ""
33
33
  api_base: str = ""
34
34
  deployment_name: Optional[str] = None
@@ -1,4 +1,4 @@
1
- from typing import Optional
1
+ from typing import List, Optional
2
2
 
3
3
 
4
4
  class XMLException(Exception):
@@ -15,7 +15,7 @@ class LangroidImportError(ImportError):
15
15
  def __init__(
16
16
  self,
17
17
  package: Optional[str] = None,
18
- extra: Optional[str] = None,
18
+ extra: Optional[str | List[str]] = None,
19
19
  error: str = "",
20
20
  *args: object,
21
21
  ) -> None:
@@ -33,9 +33,21 @@ class LangroidImportError(ImportError):
33
33
  error = f"{package} is not installed by default with Langroid.\n"
34
34
 
35
35
  if extra:
36
+ if isinstance(extra, list):
37
+ help_preamble = f"""
38
+ If you want to use it, please install langroid with one of these
39
+ extras: {', '.join(extra)}. The examples below use the first one,
40
+ i.e. {extra[0]}.
41
+ """
42
+ extra = extra[0]
43
+ else:
44
+ help_preamble = f"""
45
+ If you want to use it, please install langroid with the
46
+ `{extra}` extra.
47
+ """
48
+
36
49
  install_help = f"""
37
- If you want to use it, please install langroid
38
- with the `{extra}` extra, for example:
50
+ {help_preamble}
39
51
 
40
52
  If you are using pip:
41
53
  pip install "langroid[{extra}]"
@@ -65,7 +65,7 @@ class CodeParsingConfig(BaseSettings):
65
65
  "bash",
66
66
  ]
67
67
  chunk_size: int = 500 # tokens
68
- token_encoding_model: str = "text-embedding-ada-002"
68
+ token_encoding_model: str = "text-embedding-3-small"
69
69
  n_similar_docs: int = 4
70
70
 
71
71
 
@@ -3,9 +3,10 @@ from __future__ import annotations
3
3
  import itertools
4
4
  import logging
5
5
  import re
6
+ import tempfile
6
7
  from enum import Enum
7
8
  from io import BytesIO
8
- from typing import TYPE_CHECKING, Any, Generator, List, Tuple
9
+ from typing import TYPE_CHECKING, Any, Dict, Generator, List, Tuple
9
10
 
10
11
  from langroid.exceptions import LangroidImportError
11
12
  from langroid.utils.object_registry import ObjectRegistry
@@ -15,18 +16,24 @@ try:
15
16
  except ImportError:
16
17
  if not TYPE_CHECKING:
17
18
  fitz = None
19
+ try:
20
+ import pymupdf4llm
21
+ except ImportError:
22
+ if not TYPE_CHECKING:
23
+ pymupdf4llm = None
18
24
 
19
25
  try:
20
- import pypdf
26
+ import docling
21
27
  except ImportError:
22
28
  if not TYPE_CHECKING:
23
- pypdf = None
29
+ docling = None
24
30
 
25
31
  try:
26
- import pdfplumber
32
+ import pypdf
27
33
  except ImportError:
28
34
  if not TYPE_CHECKING:
29
- pdfplumber = None
35
+ pypdf = None
36
+
30
37
 
31
38
  import requests
32
39
  from bs4 import BeautifulSoup
@@ -41,6 +48,7 @@ logger = logging.getLogger(__name__)
41
48
 
42
49
 
43
50
  class DocumentType(str, Enum):
51
+ # TODO add `md` (Markdown) and `html`
44
52
  PDF = "pdf"
45
53
  DOCX = "docx"
46
54
  DOC = "doc"
@@ -139,10 +147,12 @@ class DocumentParser(Parser):
139
147
  if inferred_doc_type == DocumentType.PDF:
140
148
  if config.pdf.library == "fitz":
141
149
  return FitzPDFParser(source, config)
150
+ elif config.pdf.library == "pymupdf4llm":
151
+ return PyMuPDF4LLMParser(source, config)
152
+ elif config.pdf.library == "docling":
153
+ return DoclingParser(source, config)
142
154
  elif config.pdf.library == "pypdf":
143
155
  return PyPDFParser(source, config)
144
- elif config.pdf.library == "pdfplumber":
145
- return PDFPlumberParser(source, config)
146
156
  elif config.pdf.library == "unstructured":
147
157
  return UnstructuredPDFParser(source, config)
148
158
  elif config.pdf.library == "pdf2image":
@@ -307,8 +317,11 @@ class DocumentParser(Parser):
307
317
  """Yield each page in the PDF."""
308
318
  raise NotImplementedError
309
319
 
310
- def extract_text_from_page(self, page: Any) -> str:
311
- """Extract text from a given page."""
320
+ def get_document_from_page(self, page: Any) -> Document:
321
+ """
322
+ Get Langroid Document object (with possible metadata)
323
+ corresponding to a given page.
324
+ """
312
325
  raise NotImplementedError
313
326
 
314
327
  def fix_text(self, text: str) -> str:
@@ -335,7 +348,10 @@ class DocumentParser(Parser):
335
348
  """
336
349
 
337
350
  text = "".join(
338
- [self.extract_text_from_page(page) for _, page in self.iterate_pages()]
351
+ [
352
+ self.get_document_from_page(page).content
353
+ for _, page in self.iterate_pages()
354
+ ]
339
355
  )
340
356
  return Document(content=text, metadata=DocMetaData(source=self.source))
341
357
 
@@ -359,7 +375,10 @@ class DocumentParser(Parser):
359
375
  common_id = ObjectRegistry.new_id()
360
376
  n_chunks = 0 # how many chunk so far
361
377
  for i, page in self.iterate_pages():
362
- page_text = self.extract_text_from_page(page)
378
+ # not used but could be useful, esp to blend the
379
+ # metadata from the pages into the chunks
380
+ page_doc = self.get_document_from_page(page)
381
+ page_text = page_doc.content
363
382
  split += self.tokenizer.encode(page_text)
364
383
  pages.append(str(i + 1))
365
384
  # split could be so long it needs to be split
@@ -422,81 +441,152 @@ class FitzPDFParser(DocumentParser):
422
441
  yield i, page
423
442
  doc.close()
424
443
 
425
- def extract_text_from_page(self, page: "fitz.Page") -> str:
444
+ def get_document_from_page(self, page: "fitz.Page") -> Document:
426
445
  """
427
- Extract text from a given `fitz` page.
446
+ Get Document object from a given `fitz` page.
428
447
 
429
448
  Args:
430
449
  page (fitz.Page): The `fitz` page object.
431
450
 
432
451
  Returns:
433
- str: Extracted text from the page.
452
+ Document: Document object, with content and possible metadata.
434
453
  """
435
- return self.fix_text(page.get_text())
454
+ return Document(
455
+ content=self.fix_text(page.get_text()),
456
+ metadata=DocMetaData(source=self.source),
457
+ )
436
458
 
437
459
 
438
- class PyPDFParser(DocumentParser):
460
+ class PyMuPDF4LLMParser(DocumentParser):
439
461
  """
440
- Parser for processing PDFs using the `pypdf` library.
462
+ Parser for processing PDFs using the `pymupdf4llm` library.
441
463
  """
442
464
 
443
- def iterate_pages(self) -> Generator[Tuple[int, pypdf.PageObject], None, None]:
465
+ def iterate_pages(self) -> Generator[Tuple[int, "fitz.Page"], None, None]:
444
466
  """
445
- Yield each page in the PDF using `pypdf`.
467
+ Yield each page in the PDF using `fitz`.
446
468
 
447
469
  Returns:
448
- Generator[pypdf.pdf.PageObject]: Generator yielding each page.
470
+ Generator[fitz.Page]: Generator yielding each page.
449
471
  """
450
- if pypdf is None:
451
- raise LangroidImportError("pypdf", "pdf-parsers")
452
- reader = pypdf.PdfReader(self.doc_bytes)
453
- for i, page in enumerate(reader.pages):
472
+ if fitz is None:
473
+ raise LangroidImportError(
474
+ "pymupdf4llm", ["pymupdf4llm", "all", "pdf-parsers", "doc-chat"]
475
+ )
476
+ doc: fitz.Document = fitz.open(stream=self.doc_bytes, filetype="pdf")
477
+ pages: List[Dict[str, Any]] = pymupdf4llm.to_markdown(doc, page_chunks=True)
478
+ for i, page in enumerate(pages):
454
479
  yield i, page
480
+ doc.close()
455
481
 
456
- def extract_text_from_page(self, page: pypdf.PageObject) -> str:
482
+ def get_document_from_page(self, page: Dict[str, Any]) -> Document:
457
483
  """
458
- Extract text from a given `pypdf` page.
484
+ Get Document object corresponding to a given "page-chunk"
485
+ dictionary, see:
486
+ https://pymupdf.readthedocs.io/en/latest/pymupdf4llm/api.html
487
+
459
488
 
460
489
  Args:
461
- page (pypdf.pdf.PageObject): The `pypdf` page object.
490
+ page (Dict[str,Any]): The "page-chunk" dictionary.
462
491
 
463
492
  Returns:
464
- str: Extracted text from the page.
493
+ Document: Document object, with content and possible metadata.
465
494
  """
466
- return self.fix_text(page.extract_text())
495
+ return Document(
496
+ content=self.fix_text(page.get("text", "")),
497
+ # TODO could possible use other metadata from page, see above link.
498
+ metadata=DocMetaData(source=self.source),
499
+ )
467
500
 
468
501
 
469
- class PDFPlumberParser(DocumentParser):
502
+ class DoclingParser(DocumentParser):
470
503
  """
471
- Parser for processing PDFs using the `pdfplumber` library.
504
+ Parser for processing PDFs using the `docling` library.
472
505
  """
473
506
 
474
- def iterate_pages(
475
- self,
476
- ) -> (Generator)[Tuple[int, pdfplumber.pdf.Page], None, None]: # type: ignore
507
+ def iterate_pages(self) -> Generator[Tuple[int, Any], None, None]:
508
+ """
509
+ Yield each page in the PDF using `docling`.
510
+
511
+ Returns:
512
+ Generator[docling.Page]: Generator yielding each page.
477
513
  """
478
- Yield each page in the PDF using `pdfplumber`.
514
+ if docling is None:
515
+ raise LangroidImportError(
516
+ "docling", ["docling", "pdf-parsers", "all", "doc-chat"]
517
+ )
518
+ from docling.datamodel.document import TextItem # type: ignore
519
+ from docling.document_converter import ( # type: ignore
520
+ ConversionResult,
521
+ DocumentConverter,
522
+ )
523
+
524
+ converter = DocumentConverter()
525
+ file_path = self.source
526
+ if file_path == "bytes":
527
+ with tempfile.NamedTemporaryFile(delete=False) as tmp:
528
+ tmp.write(self.doc_bytes.getvalue())
529
+ file_path = tmp.name
530
+ result: ConversionResult = converter.convert(file_path)
531
+ doc = result.document
532
+ n_pages = doc.num_pages() # type: ignore
533
+ for i in range(n_pages):
534
+ texts = [
535
+ item[0].text
536
+ for item in doc.iterate_items(page_no=i + 1)
537
+ if isinstance(item[0], TextItem)
538
+ ]
539
+ text = "\n".join(texts)
540
+ yield i, text
541
+
542
+ def get_document_from_page(self, page: str) -> Document:
543
+ """
544
+ Get Document object from a given `docling` "page" (actually a chunk).
545
+
546
+ Args:
547
+ page (docling.chunking.DocChunk): The `docling` chunk
479
548
 
480
549
  Returns:
481
- Generator[pdfplumber.Page]: Generator yielding each page.
550
+ Document: Document object, with content and possible metadata.
482
551
  """
483
- if pdfplumber is None:
484
- raise LangroidImportError("pdfplumber", "pdf-parsers")
485
- with pdfplumber.open(self.doc_bytes) as pdf:
486
- for i, page in enumerate(pdf.pages):
487
- yield i, page
552
+ return Document(
553
+ content=self.fix_text(page),
554
+ metadata=DocMetaData(source=self.source),
555
+ )
556
+
488
557
 
489
- def extract_text_from_page(self, page: pdfplumber.pdf.Page) -> str: # type: ignore
558
+ class PyPDFParser(DocumentParser):
559
+ """
560
+ Parser for processing PDFs using the `pypdf` library.
561
+ """
562
+
563
+ def iterate_pages(self) -> Generator[Tuple[int, pypdf.PageObject], None, None]:
490
564
  """
491
- Extract text from a given `pdfplumber` page.
565
+ Yield each page in the PDF using `pypdf`.
566
+
567
+ Returns:
568
+ Generator[pypdf.pdf.PageObject]: Generator yielding each page.
569
+ """
570
+ if pypdf is None:
571
+ raise LangroidImportError("pypdf", "pdf-parsers")
572
+ reader = pypdf.PdfReader(self.doc_bytes)
573
+ for i, page in enumerate(reader.pages):
574
+ yield i, page
575
+
576
+ def get_document_from_page(self, page: pypdf.PageObject) -> Document:
577
+ """
578
+ Get Document object from a given `pypdf` page.
492
579
 
493
580
  Args:
494
- page (pdfplumber.Page): The `pdfplumber` page object.
581
+ page (pypdf.pdf.PageObject): The `pypdf` page object.
495
582
 
496
583
  Returns:
497
- str: Extracted text from the page.
584
+ Document: Document object, with content and possible metadata.
498
585
  """
499
- return self.fix_text(page.extract_text())
586
+ return Document(
587
+ content=self.fix_text(page.extract_text()),
588
+ metadata=DocMetaData(source=self.source),
589
+ )
500
590
 
501
591
 
502
592
  class ImagePdfParser(DocumentParser):
@@ -516,15 +606,15 @@ class ImagePdfParser(DocumentParser):
516
606
  for i, image in enumerate(images):
517
607
  yield i, image
518
608
 
519
- def extract_text_from_page(self, page: "Image") -> str: # type: ignore
609
+ def get_document_from_page(self, page: "Image") -> Document: # type: ignore
520
610
  """
521
- Extract text from a given `pdf2image` page.
611
+ Get Document object corresponding to a given `pdf2image` page.
522
612
 
523
613
  Args:
524
614
  page (Image): The PIL Image object.
525
615
 
526
616
  Returns:
527
- str: Extracted text from the image.
617
+ Document: Document object, with content and possible metadata.
528
618
  """
529
619
  try:
530
620
  import pytesseract
@@ -532,7 +622,10 @@ class ImagePdfParser(DocumentParser):
532
622
  raise LangroidImportError("pytesseract", "pdf-parsers")
533
623
 
534
624
  text = pytesseract.image_to_string(page)
535
- return self.fix_text(text)
625
+ return Document(
626
+ content=self.fix_text(text),
627
+ metadata=DocMetaData(source=self.source),
628
+ )
536
629
 
537
630
 
538
631
  class UnstructuredPDFParser(DocumentParser):
@@ -564,8 +657,8 @@ class UnstructuredPDFParser(DocumentParser):
564
657
  The `unstructured` library failed to parse the pdf.
565
658
  Please try a different library by setting the `library` field
566
659
  in the `pdf` section of the `parsing` field in the config file.
567
- Supported libraries are:
568
- fitz, pypdf, pdfplumber, unstructured
660
+ Other supported libraries are:
661
+ fitz, pymupdf4llm, pypdf
569
662
  """
570
663
  )
571
664
 
@@ -584,18 +677,21 @@ class UnstructuredPDFParser(DocumentParser):
584
677
  if page_elements:
585
678
  yield page_number, page_elements
586
679
 
587
- def extract_text_from_page(self, page: Any) -> str:
680
+ def get_document_from_page(self, page: Any) -> Document:
588
681
  """
589
- Extract text from a given `unstructured` element.
682
+ Get Document object from a given `unstructured` element.
590
683
 
591
684
  Args:
592
685
  page (unstructured element): The `unstructured` element object.
593
686
 
594
687
  Returns:
595
- str: Extracted text from the element.
688
+ Document: Document object, with content and possible metadata.
596
689
  """
597
690
  text = " ".join(el.text for el in page)
598
- return self.fix_text(text)
691
+ return Document(
692
+ content=self.fix_text(text),
693
+ metadata=DocMetaData(source=self.source),
694
+ )
599
695
 
600
696
 
601
697
  class UnstructuredDocxParser(DocumentParser):
@@ -632,9 +728,9 @@ class UnstructuredDocxParser(DocumentParser):
632
728
  if page_elements:
633
729
  yield page_number, page_elements
634
730
 
635
- def extract_text_from_page(self, page: Any) -> str:
731
+ def get_document_from_page(self, page: Any) -> Document:
636
732
  """
637
- Extract text from a given `unstructured` element.
733
+ Get Document object from a given `unstructured` element.
638
734
 
639
735
  Note:
640
736
  The concept of "pages" doesn't actually exist in the .docx file format in
@@ -647,10 +743,13 @@ class UnstructuredDocxParser(DocumentParser):
647
743
  page (unstructured element): The `unstructured` element object.
648
744
 
649
745
  Returns:
650
- str: Extracted text from the element.
746
+ Document object, with content and possible metadata.
651
747
  """
652
748
  text = " ".join(el.text for el in page)
653
- return self.fix_text(text)
749
+ return Document(
750
+ content=self.fix_text(text),
751
+ metadata=DocMetaData(source=self.source),
752
+ )
654
753
 
655
754
 
656
755
  class UnstructuredDocParser(UnstructuredDocxParser):
@@ -704,15 +803,19 @@ class PythonDocxParser(DocumentParser):
704
803
  for i, para in enumerate(doc.paragraphs, start=1):
705
804
  yield i, [para]
706
805
 
707
- def extract_text_from_page(self, page: Any) -> str:
806
+ def get_document_from_page(self, page: Any) -> Document:
708
807
  """
709
- Extract text from a given 'page', which in this case is a single paragraph.
808
+ Get Document object from a given 'page', which in this case is a single
809
+ paragraph.
710
810
 
711
811
  Args:
712
812
  page (list): A list containing a single Paragraph object.
713
813
 
714
814
  Returns:
715
- str: Extracted text from the paragraph.
815
+ Document: Document object, with content and possible metadata.
716
816
  """
717
817
  paragraph = page[0]
718
- return self.fix_text(paragraph.text)
818
+ return Document(
819
+ content=self.fix_text(paragraph.text),
820
+ metadata=DocMetaData(source=self.source),
821
+ )
@@ -23,11 +23,12 @@ class Splitter(str, Enum):
23
23
  class PdfParsingConfig(BaseSettings):
24
24
  library: Literal[
25
25
  "fitz",
26
- "pdfplumber",
26
+ "pymupdf4llm",
27
+ "docling",
27
28
  "pypdf",
28
29
  "unstructured",
29
30
  "pdf2image",
30
- ] = "pdfplumber"
31
+ ] = "pymupdf4llm"
31
32
 
32
33
 
33
34
  class DocxParsingConfig(BaseSettings):
@@ -40,6 +41,7 @@ class DocParsingConfig(BaseSettings):
40
41
 
41
42
  class ParsingConfig(BaseSettings):
42
43
  splitter: str = Splitter.TOKENS
44
+ chunk_by_page: bool = False # split by page?
43
45
  chunk_size: int = 200 # aim for this many tokens per chunk
44
46
  overlap: int = 50 # overlap between chunks
45
47
  max_chunks: int = 10_000
@@ -49,7 +51,7 @@ class ParsingConfig(BaseSettings):
49
51
  n_similar_docs: int = 4
50
52
  n_neighbor_ids: int = 5 # window size to store around each chunk
51
53
  separators: List[str] = ["\n\n", "\n", " ", ""]
52
- token_encoding_model: str = "text-embedding-ada-002"
54
+ token_encoding_model: str = "text-embedding-3-large"
53
55
  pdf: PdfParsingConfig = PdfParsingConfig()
54
56
  docx: DocxParsingConfig = DocxParsingConfig()
55
57
  doc: DocParsingConfig = DocParsingConfig()
@@ -61,7 +63,7 @@ class Parser:
61
63
  try:
62
64
  self.tokenizer = tiktoken.encoding_for_model(config.token_encoding_model)
63
65
  except Exception:
64
- self.tokenizer = tiktoken.encoding_for_model("text-embedding-ada-002")
66
+ self.tokenizer = tiktoken.encoding_for_model("text-embedding-3-small")
65
67
 
66
68
  def num_tokens(self, text: str) -> int:
67
69
  tokens = self.tokenizer.encode(text)