langroid 0.36.1__tar.gz → 0.37.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (130) hide show
  1. {langroid-0.36.1 → langroid-0.37.1}/PKG-INFO +21 -11
  2. {langroid-0.36.1 → langroid-0.37.1}/langroid/agent/special/doc_chat_agent.py +3 -3
  3. {langroid-0.36.1 → langroid-0.37.1}/langroid/embedding_models/models.py +2 -2
  4. {langroid-0.36.1 → langroid-0.37.1}/langroid/exceptions.py +16 -4
  5. {langroid-0.36.1 → langroid-0.37.1}/langroid/parsing/code_parser.py +1 -1
  6. {langroid-0.36.1 → langroid-0.37.1}/langroid/parsing/document_parser.py +161 -64
  7. {langroid-0.36.1 → langroid-0.37.1}/langroid/parsing/parser.py +6 -4
  8. langroid-0.37.1/langroid/parsing/pdf_utils.py +55 -0
  9. {langroid-0.36.1 → langroid-0.37.1}/langroid/vector_store/chromadb.py +12 -1
  10. {langroid-0.36.1 → langroid-0.37.1}/langroid/vector_store/qdrantdb.py +1 -1
  11. {langroid-0.36.1 → langroid-0.37.1}/langroid/vector_store/weaviatedb.py +5 -5
  12. {langroid-0.36.1 → langroid-0.37.1}/pyproject.toml +25 -11
  13. {langroid-0.36.1 → langroid-0.37.1}/.gitignore +0 -0
  14. {langroid-0.36.1 → langroid-0.37.1}/LICENSE +0 -0
  15. {langroid-0.36.1 → langroid-0.37.1}/README.md +0 -0
  16. {langroid-0.36.1 → langroid-0.37.1}/langroid/__init__.py +0 -0
  17. {langroid-0.36.1 → langroid-0.37.1}/langroid/agent/__init__.py +0 -0
  18. {langroid-0.36.1 → langroid-0.37.1}/langroid/agent/base.py +0 -0
  19. {langroid-0.36.1 → langroid-0.37.1}/langroid/agent/batch.py +0 -0
  20. {langroid-0.36.1 → langroid-0.37.1}/langroid/agent/callbacks/__init__.py +0 -0
  21. {langroid-0.36.1 → langroid-0.37.1}/langroid/agent/callbacks/chainlit.py +0 -0
  22. {langroid-0.36.1 → langroid-0.37.1}/langroid/agent/chat_agent.py +0 -0
  23. {langroid-0.36.1 → langroid-0.37.1}/langroid/agent/chat_document.py +0 -0
  24. {langroid-0.36.1 → langroid-0.37.1}/langroid/agent/openai_assistant.py +0 -0
  25. {langroid-0.36.1 → langroid-0.37.1}/langroid/agent/special/__init__.py +0 -0
  26. {langroid-0.36.1 → langroid-0.37.1}/langroid/agent/special/arangodb/__init__.py +0 -0
  27. {langroid-0.36.1 → langroid-0.37.1}/langroid/agent/special/arangodb/arangodb_agent.py +0 -0
  28. {langroid-0.36.1 → langroid-0.37.1}/langroid/agent/special/arangodb/system_messages.py +0 -0
  29. {langroid-0.36.1 → langroid-0.37.1}/langroid/agent/special/arangodb/tools.py +0 -0
  30. {langroid-0.36.1 → langroid-0.37.1}/langroid/agent/special/arangodb/utils.py +0 -0
  31. {langroid-0.36.1 → langroid-0.37.1}/langroid/agent/special/lance_doc_chat_agent.py +0 -0
  32. {langroid-0.36.1 → langroid-0.37.1}/langroid/agent/special/lance_rag/__init__.py +0 -0
  33. {langroid-0.36.1 → langroid-0.37.1}/langroid/agent/special/lance_rag/critic_agent.py +0 -0
  34. {langroid-0.36.1 → langroid-0.37.1}/langroid/agent/special/lance_rag/lance_rag_task.py +0 -0
  35. {langroid-0.36.1 → langroid-0.37.1}/langroid/agent/special/lance_rag/query_planner_agent.py +0 -0
  36. {langroid-0.36.1 → langroid-0.37.1}/langroid/agent/special/lance_tools.py +0 -0
  37. {langroid-0.36.1 → langroid-0.37.1}/langroid/agent/special/neo4j/__init__.py +0 -0
  38. {langroid-0.36.1 → langroid-0.37.1}/langroid/agent/special/neo4j/csv_kg_chat.py +0 -0
  39. {langroid-0.36.1 → langroid-0.37.1}/langroid/agent/special/neo4j/neo4j_chat_agent.py +0 -0
  40. {langroid-0.36.1 → langroid-0.37.1}/langroid/agent/special/neo4j/system_messages.py +0 -0
  41. {langroid-0.36.1 → langroid-0.37.1}/langroid/agent/special/neo4j/tools.py +0 -0
  42. {langroid-0.36.1 → langroid-0.37.1}/langroid/agent/special/relevance_extractor_agent.py +0 -0
  43. {langroid-0.36.1 → langroid-0.37.1}/langroid/agent/special/retriever_agent.py +0 -0
  44. {langroid-0.36.1 → langroid-0.37.1}/langroid/agent/special/sql/__init__.py +0 -0
  45. {langroid-0.36.1 → langroid-0.37.1}/langroid/agent/special/sql/sql_chat_agent.py +0 -0
  46. {langroid-0.36.1 → langroid-0.37.1}/langroid/agent/special/sql/utils/__init__.py +0 -0
  47. {langroid-0.36.1 → langroid-0.37.1}/langroid/agent/special/sql/utils/description_extractors.py +0 -0
  48. {langroid-0.36.1 → langroid-0.37.1}/langroid/agent/special/sql/utils/populate_metadata.py +0 -0
  49. {langroid-0.36.1 → langroid-0.37.1}/langroid/agent/special/sql/utils/system_message.py +0 -0
  50. {langroid-0.36.1 → langroid-0.37.1}/langroid/agent/special/sql/utils/tools.py +0 -0
  51. {langroid-0.36.1 → langroid-0.37.1}/langroid/agent/special/table_chat_agent.py +0 -0
  52. {langroid-0.36.1 → langroid-0.37.1}/langroid/agent/task.py +0 -0
  53. {langroid-0.36.1 → langroid-0.37.1}/langroid/agent/tool_message.py +0 -0
  54. {langroid-0.36.1 → langroid-0.37.1}/langroid/agent/tools/__init__.py +0 -0
  55. {langroid-0.36.1 → langroid-0.37.1}/langroid/agent/tools/duckduckgo_search_tool.py +0 -0
  56. {langroid-0.36.1 → langroid-0.37.1}/langroid/agent/tools/file_tools.py +0 -0
  57. {langroid-0.36.1 → langroid-0.37.1}/langroid/agent/tools/google_search_tool.py +0 -0
  58. {langroid-0.36.1 → langroid-0.37.1}/langroid/agent/tools/metaphor_search_tool.py +0 -0
  59. {langroid-0.36.1 → langroid-0.37.1}/langroid/agent/tools/orchestration.py +0 -0
  60. {langroid-0.36.1 → langroid-0.37.1}/langroid/agent/tools/recipient_tool.py +0 -0
  61. {langroid-0.36.1 → langroid-0.37.1}/langroid/agent/tools/retrieval_tool.py +0 -0
  62. {langroid-0.36.1 → langroid-0.37.1}/langroid/agent/tools/rewind_tool.py +0 -0
  63. {langroid-0.36.1 → langroid-0.37.1}/langroid/agent/tools/segment_extract_tool.py +0 -0
  64. {langroid-0.36.1 → langroid-0.37.1}/langroid/agent/xml_tool_message.py +0 -0
  65. {langroid-0.36.1 → langroid-0.37.1}/langroid/cachedb/__init__.py +0 -0
  66. {langroid-0.36.1 → langroid-0.37.1}/langroid/cachedb/base.py +0 -0
  67. {langroid-0.36.1 → langroid-0.37.1}/langroid/cachedb/momento_cachedb.py +0 -0
  68. {langroid-0.36.1 → langroid-0.37.1}/langroid/cachedb/redis_cachedb.py +0 -0
  69. {langroid-0.36.1 → langroid-0.37.1}/langroid/embedding_models/__init__.py +0 -0
  70. {langroid-0.36.1 → langroid-0.37.1}/langroid/embedding_models/base.py +0 -0
  71. {langroid-0.36.1 → langroid-0.37.1}/langroid/embedding_models/protoc/__init__.py +0 -0
  72. {langroid-0.36.1 → langroid-0.37.1}/langroid/embedding_models/protoc/embeddings.proto +0 -0
  73. {langroid-0.36.1 → langroid-0.37.1}/langroid/embedding_models/protoc/embeddings_pb2.py +0 -0
  74. {langroid-0.36.1 → langroid-0.37.1}/langroid/embedding_models/protoc/embeddings_pb2.pyi +0 -0
  75. {langroid-0.36.1 → langroid-0.37.1}/langroid/embedding_models/protoc/embeddings_pb2_grpc.py +0 -0
  76. {langroid-0.36.1 → langroid-0.37.1}/langroid/embedding_models/remote_embeds.py +0 -0
  77. {langroid-0.36.1 → langroid-0.37.1}/langroid/language_models/__init__.py +0 -0
  78. {langroid-0.36.1 → langroid-0.37.1}/langroid/language_models/azure_openai.py +0 -0
  79. {langroid-0.36.1 → langroid-0.37.1}/langroid/language_models/base.py +0 -0
  80. {langroid-0.36.1 → langroid-0.37.1}/langroid/language_models/config.py +0 -0
  81. {langroid-0.36.1 → langroid-0.37.1}/langroid/language_models/mock_lm.py +0 -0
  82. {langroid-0.36.1 → langroid-0.37.1}/langroid/language_models/openai_gpt.py +0 -0
  83. {langroid-0.36.1 → langroid-0.37.1}/langroid/language_models/prompt_formatter/__init__.py +0 -0
  84. {langroid-0.36.1 → langroid-0.37.1}/langroid/language_models/prompt_formatter/base.py +0 -0
  85. {langroid-0.36.1 → langroid-0.37.1}/langroid/language_models/prompt_formatter/hf_formatter.py +0 -0
  86. {langroid-0.36.1 → langroid-0.37.1}/langroid/language_models/prompt_formatter/llama2_formatter.py +0 -0
  87. {langroid-0.36.1 → langroid-0.37.1}/langroid/language_models/utils.py +0 -0
  88. {langroid-0.36.1 → langroid-0.37.1}/langroid/mytypes.py +0 -0
  89. {langroid-0.36.1 → langroid-0.37.1}/langroid/parsing/__init__.py +0 -0
  90. {langroid-0.36.1 → langroid-0.37.1}/langroid/parsing/agent_chats.py +0 -0
  91. {langroid-0.36.1 → langroid-0.37.1}/langroid/parsing/para_sentence_split.py +0 -0
  92. {langroid-0.36.1 → langroid-0.37.1}/langroid/parsing/parse_json.py +0 -0
  93. {langroid-0.36.1 → langroid-0.37.1}/langroid/parsing/repo_loader.py +0 -0
  94. {langroid-0.36.1 → langroid-0.37.1}/langroid/parsing/routing.py +0 -0
  95. {langroid-0.36.1 → langroid-0.37.1}/langroid/parsing/search.py +0 -0
  96. {langroid-0.36.1 → langroid-0.37.1}/langroid/parsing/spider.py +0 -0
  97. {langroid-0.36.1 → langroid-0.37.1}/langroid/parsing/table_loader.py +0 -0
  98. {langroid-0.36.1 → langroid-0.37.1}/langroid/parsing/url_loader.py +0 -0
  99. {langroid-0.36.1 → langroid-0.37.1}/langroid/parsing/urls.py +0 -0
  100. {langroid-0.36.1 → langroid-0.37.1}/langroid/parsing/utils.py +0 -0
  101. {langroid-0.36.1 → langroid-0.37.1}/langroid/parsing/web_search.py +0 -0
  102. {langroid-0.36.1 → langroid-0.37.1}/langroid/prompts/__init__.py +0 -0
  103. {langroid-0.36.1 → langroid-0.37.1}/langroid/prompts/dialog.py +0 -0
  104. {langroid-0.36.1 → langroid-0.37.1}/langroid/prompts/prompts_config.py +0 -0
  105. {langroid-0.36.1 → langroid-0.37.1}/langroid/prompts/templates.py +0 -0
  106. {langroid-0.36.1 → langroid-0.37.1}/langroid/py.typed +0 -0
  107. {langroid-0.36.1 → langroid-0.37.1}/langroid/pydantic_v1/__init__.py +0 -0
  108. {langroid-0.36.1 → langroid-0.37.1}/langroid/pydantic_v1/main.py +0 -0
  109. {langroid-0.36.1 → langroid-0.37.1}/langroid/utils/__init__.py +0 -0
  110. {langroid-0.36.1 → langroid-0.37.1}/langroid/utils/algorithms/__init__.py +0 -0
  111. {langroid-0.36.1 → langroid-0.37.1}/langroid/utils/algorithms/graph.py +0 -0
  112. {langroid-0.36.1 → langroid-0.37.1}/langroid/utils/configuration.py +0 -0
  113. {langroid-0.36.1 → langroid-0.37.1}/langroid/utils/constants.py +0 -0
  114. {langroid-0.36.1 → langroid-0.37.1}/langroid/utils/git_utils.py +0 -0
  115. {langroid-0.36.1 → langroid-0.37.1}/langroid/utils/globals.py +0 -0
  116. {langroid-0.36.1 → langroid-0.37.1}/langroid/utils/logging.py +0 -0
  117. {langroid-0.36.1 → langroid-0.37.1}/langroid/utils/object_registry.py +0 -0
  118. {langroid-0.36.1 → langroid-0.37.1}/langroid/utils/output/__init__.py +0 -0
  119. {langroid-0.36.1 → langroid-0.37.1}/langroid/utils/output/citations.py +0 -0
  120. {langroid-0.36.1 → langroid-0.37.1}/langroid/utils/output/printing.py +0 -0
  121. {langroid-0.36.1 → langroid-0.37.1}/langroid/utils/output/status.py +0 -0
  122. {langroid-0.36.1 → langroid-0.37.1}/langroid/utils/pandas_utils.py +0 -0
  123. {langroid-0.36.1 → langroid-0.37.1}/langroid/utils/pydantic_utils.py +0 -0
  124. {langroid-0.36.1 → langroid-0.37.1}/langroid/utils/system.py +0 -0
  125. {langroid-0.36.1 → langroid-0.37.1}/langroid/utils/types.py +0 -0
  126. {langroid-0.36.1 → langroid-0.37.1}/langroid/vector_store/__init__.py +0 -0
  127. {langroid-0.36.1 → langroid-0.37.1}/langroid/vector_store/base.py +0 -0
  128. {langroid-0.36.1 → langroid-0.37.1}/langroid/vector_store/lancedb.py +0 -0
  129. {langroid-0.36.1 → langroid-0.37.1}/langroid/vector_store/meilisearch.py +0 -0
  130. {langroid-0.36.1 → langroid-0.37.1}/langroid/vector_store/momento.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: langroid
3
- Version: 0.36.1
3
+ Version: 0.37.1
4
4
  Summary: Harness LLMs with Multi-Agent Programming
5
5
  Author-email: Prasad Chalasani <pchalasani@gmail.com>
6
6
  License: MIT
@@ -12,6 +12,7 @@ Requires-Dist: async-generator<2.0,>=1.10
12
12
  Requires-Dist: bs4<1.0.0,>=0.0.1
13
13
  Requires-Dist: cerebras-cloud-sdk<2.0.0,>=1.1.0
14
14
  Requires-Dist: colorlog<7.0.0,>=6.7.0
15
+ Requires-Dist: docling<3.0.0,>=2.16.0
15
16
  Requires-Dist: docstring-parser<1.0,>=0.16
16
17
  Requires-Dist: duckduckgo-search<7.0.0,>=6.0.0
17
18
  Requires-Dist: faker<19.0.0,>=18.9.0
@@ -32,9 +33,10 @@ Requires-Dist: onnxruntime<2.0.0,>=1.16.1
32
33
  Requires-Dist: openai<2.0.0,>=1.45.0
33
34
  Requires-Dist: pandas<3.0.0,>=2.0.3
34
35
  Requires-Dist: prettytable<4.0.0,>=3.8.0
35
- Requires-Dist: pydantic<2.10.2,>=1
36
+ Requires-Dist: pydantic<3.0.0,>=1
36
37
  Requires-Dist: pygithub<2.0.0,>=1.58.1
37
38
  Requires-Dist: pygments<3.0.0,>=2.15.1
39
+ Requires-Dist: pymupdf4llm<0.1.0,>=0.0.17
38
40
  Requires-Dist: pyparsing<4.0.0,>=3.0.9
39
41
  Requires-Dist: pytest-rerunfailures<16.0,>=15.0
40
42
  Requires-Dist: python-dotenv<2.0.0,>=1.0.0
@@ -55,14 +57,15 @@ Provides-Extra: all
55
57
  Requires-Dist: arango-datasets<2.0.0,>=1.2.2; extra == 'all'
56
58
  Requires-Dist: chainlit<3.0.0,>=2.0.1; extra == 'all'
57
59
  Requires-Dist: chromadb<=0.4.23,>=0.4.21; extra == 'all'
60
+ Requires-Dist: docling<3.0.0,>=2.16.0; extra == 'all'
58
61
  Requires-Dist: fastembed<0.4.0,>=0.3.1; extra == 'all'
59
- Requires-Dist: huggingface-hub<0.22.0,>=0.21.2; extra == 'all'
62
+ Requires-Dist: huggingface-hub<1.0.0,>=0.21.2; extra == 'all'
60
63
  Requires-Dist: litellm<2.0.0,>=1.30.1; extra == 'all'
61
64
  Requires-Dist: metaphor-python<0.2.0,>=0.1.23; extra == 'all'
62
65
  Requires-Dist: neo4j<6.0.0,>=5.14.1; extra == 'all'
63
66
  Requires-Dist: pdf2image<2.0.0,>=1.17.0; extra == 'all'
64
- Requires-Dist: pdfplumber<0.11.0,>=0.10.2; extra == 'all'
65
67
  Requires-Dist: psycopg2<3.0.0,>=2.9.7; extra == 'all'
68
+ Requires-Dist: pymupdf4llm<0.1.0,>=0.0.17; extra == 'all'
66
69
  Requires-Dist: pymupdf<2.0.0,>=1.23.3; extra == 'all'
67
70
  Requires-Dist: pymysql<2.0.0,>=1.1.0; extra == 'all'
68
71
  Requires-Dist: pypdf>=5.1.0; extra == 'all'
@@ -74,7 +77,7 @@ Requires-Dist: sentence-transformers<3.0.0,>=2.2.2; extra == 'all'
74
77
  Requires-Dist: sqlalchemy<3.0.0,>=2.0.19; extra == 'all'
75
78
  Requires-Dist: torch<3.0.0,>=2.0.0; extra == 'all'
76
79
  Requires-Dist: transformers<5.0.0,>=4.40.1; extra == 'all'
77
- Requires-Dist: unstructured[docx,pdf,pptx]<0.10.18,>=0.10.16; extra == 'all'
80
+ Requires-Dist: unstructured[docx,pdf,pptx]<1.0.0,>=0.16.15; extra == 'all'
78
81
  Requires-Dist: weaviate-client>=4.9.6; extra == 'all'
79
82
  Provides-Extra: arango
80
83
  Requires-Dist: arango-datasets<2.0.0,>=1.2.2; extra == 'arango'
@@ -89,13 +92,17 @@ Requires-Dist: psycopg2<3.0.0,>=2.9.7; extra == 'db'
89
92
  Requires-Dist: pymysql<2.0.0,>=1.1.0; extra == 'db'
90
93
  Requires-Dist: sqlalchemy<3.0.0,>=2.0.19; extra == 'db'
91
94
  Provides-Extra: doc-chat
95
+ Requires-Dist: docling<3.0.0,>=2.16.0; extra == 'doc-chat'
92
96
  Requires-Dist: pdf2image<2.0.0,>=1.17.0; extra == 'doc-chat'
93
- Requires-Dist: pdfplumber<0.11.0,>=0.10.2; extra == 'doc-chat'
97
+ Requires-Dist: pymupdf4llm<0.1.0,>=0.0.17; extra == 'doc-chat'
94
98
  Requires-Dist: pymupdf<2.0.0,>=1.23.3; extra == 'doc-chat'
95
99
  Requires-Dist: pypdf>=5.1.0; extra == 'doc-chat'
96
100
  Requires-Dist: pytesseract<0.4.0,>=0.3.10; extra == 'doc-chat'
97
101
  Requires-Dist: python-docx<2.0.0,>=1.1.0; extra == 'doc-chat'
98
- Requires-Dist: unstructured[docx,pdf,pptx]<0.10.18,>=0.10.16; extra == 'doc-chat'
102
+ Requires-Dist: unstructured[docx,pdf,pptx]<1.0.0,>=0.16.15; extra == 'doc-chat'
103
+ Provides-Extra: docling
104
+ Requires-Dist: docling<3.0.0,>=2.16.0; extra == 'docling'
105
+ Requires-Dist: pypdf>=5.1.0; extra == 'docling'
99
106
  Provides-Extra: docx
100
107
  Requires-Dist: python-docx<2.0.0,>=1.1.0; extra == 'docx'
101
108
  Provides-Extra: fastembed
@@ -104,7 +111,7 @@ Provides-Extra: hf-embeddings
104
111
  Requires-Dist: sentence-transformers<3.0.0,>=2.2.2; extra == 'hf-embeddings'
105
112
  Requires-Dist: torch<3.0.0,>=2.0.0; extra == 'hf-embeddings'
106
113
  Provides-Extra: hf-transformers
107
- Requires-Dist: huggingface-hub<0.22.0,>=0.21.2; extra == 'hf-transformers'
114
+ Requires-Dist: huggingface-hub<1.0.0,>=0.21.2; extra == 'hf-transformers'
108
115
  Requires-Dist: sentence-transformers<3.0.0,>=2.2.2; extra == 'hf-transformers'
109
116
  Requires-Dist: torch<3.0.0,>=2.0.0; extra == 'hf-transformers'
110
117
  Requires-Dist: transformers<5.0.0,>=4.40.1; extra == 'hf-transformers'
@@ -125,13 +132,16 @@ Requires-Dist: pymysql<2.0.0,>=1.1.0; extra == 'mysql'
125
132
  Provides-Extra: neo4j
126
133
  Requires-Dist: neo4j<6.0.0,>=5.14.1; extra == 'neo4j'
127
134
  Provides-Extra: pdf-parsers
135
+ Requires-Dist: docling<3.0.0,>=2.16.0; extra == 'pdf-parsers'
128
136
  Requires-Dist: pdf2image<2.0.0,>=1.17.0; extra == 'pdf-parsers'
129
- Requires-Dist: pdfplumber<0.11.0,>=0.10.2; extra == 'pdf-parsers'
137
+ Requires-Dist: pymupdf4llm<0.1.0,>=0.0.17; extra == 'pdf-parsers'
130
138
  Requires-Dist: pymupdf<2.0.0,>=1.23.3; extra == 'pdf-parsers'
131
139
  Requires-Dist: pypdf>=5.1.0; extra == 'pdf-parsers'
132
140
  Requires-Dist: pytesseract<0.4.0,>=0.3.10; extra == 'pdf-parsers'
133
141
  Provides-Extra: postgres
134
142
  Requires-Dist: psycopg2<3.0.0,>=2.9.7; extra == 'postgres'
143
+ Provides-Extra: pymupdf4llm
144
+ Requires-Dist: pymupdf4llm<0.1.0,>=0.0.17; extra == 'pymupdf4llm'
135
145
  Provides-Extra: scrapy
136
146
  Requires-Dist: scrapy<3.0.0,>=2.11.0; extra == 'scrapy'
137
147
  Provides-Extra: sql
@@ -139,11 +149,11 @@ Requires-Dist: psycopg2<3.0.0,>=2.9.7; extra == 'sql'
139
149
  Requires-Dist: pymysql<2.0.0,>=1.1.0; extra == 'sql'
140
150
  Requires-Dist: sqlalchemy<3.0.0,>=2.0.19; extra == 'sql'
141
151
  Provides-Extra: transformers
142
- Requires-Dist: huggingface-hub<0.22.0,>=0.21.2; extra == 'transformers'
152
+ Requires-Dist: huggingface-hub<1.0.0,>=0.21.2; extra == 'transformers'
143
153
  Requires-Dist: torch<3.0.0,>=2.0.0; extra == 'transformers'
144
154
  Requires-Dist: transformers<5.0.0,>=4.40.1; extra == 'transformers'
145
155
  Provides-Extra: unstructured
146
- Requires-Dist: unstructured[docx,pdf,pptx]<0.10.18,>=0.10.16; extra == 'unstructured'
156
+ Requires-Dist: unstructured[docx,pdf,pptx]<1.0.0,>=0.16.15; extra == 'unstructured'
147
157
  Provides-Extra: vecdbs
148
158
  Requires-Dist: chromadb<=0.4.23,>=0.4.21; extra == 'vecdbs'
149
159
  Requires-Dist: lancedb<0.9.0,>=0.8.2; extra == 'vecdbs'
@@ -100,7 +100,7 @@ hf_embed_config = SentenceTransformerEmbeddingsConfig(
100
100
 
101
101
  oai_embed_config = OpenAIEmbeddingsConfig(
102
102
  model_type="openai",
103
- model_name="text-embedding-ada-002",
103
+ model_name="text-embedding-3-small",
104
104
  dims=1536,
105
105
  )
106
106
 
@@ -189,8 +189,8 @@ class DocChatAgentConfig(ChatAgentConfig):
189
189
  # NOTE: PDF parsing is extremely challenging, and each library
190
190
  # has its own strengths and weaknesses.
191
191
  # Try one that works for your use case.
192
- # or "unstructured", "pdfplumber", "fitz", "pypdf"
193
- library="pdfplumber",
192
+ # or "unstructured", "fitz", "pymupdf4llm", "pypdf"
193
+ library="pymupdf4llm",
194
194
  ),
195
195
  )
196
196
 
@@ -18,7 +18,7 @@ AzureADTokenProvider = Callable[[], str]
18
18
 
19
19
  class OpenAIEmbeddingsConfig(EmbeddingModelsConfig):
20
20
  model_type: str = "openai"
21
- model_name: str = "text-embedding-ada-002"
21
+ model_name: str = "text-embedding-3-large"
22
22
  api_key: str = ""
23
23
  api_base: Optional[str] = None
24
24
  organization: str = ""
@@ -28,7 +28,7 @@ class OpenAIEmbeddingsConfig(EmbeddingModelsConfig):
28
28
 
29
29
  class AzureOpenAIEmbeddingsConfig(EmbeddingModelsConfig):
30
30
  model_type: str = "azure-openai"
31
- model_name: str = "text-embedding-ada-002"
31
+ model_name: str = "text-embedding-3-large"
32
32
  api_key: str = ""
33
33
  api_base: str = ""
34
34
  deployment_name: Optional[str] = None
@@ -1,4 +1,4 @@
1
- from typing import Optional
1
+ from typing import List, Optional
2
2
 
3
3
 
4
4
  class XMLException(Exception):
@@ -15,7 +15,7 @@ class LangroidImportError(ImportError):
15
15
  def __init__(
16
16
  self,
17
17
  package: Optional[str] = None,
18
- extra: Optional[str] = None,
18
+ extra: Optional[str | List[str]] = None,
19
19
  error: str = "",
20
20
  *args: object,
21
21
  ) -> None:
@@ -33,9 +33,21 @@ class LangroidImportError(ImportError):
33
33
  error = f"{package} is not installed by default with Langroid.\n"
34
34
 
35
35
  if extra:
36
+ if isinstance(extra, list):
37
+ help_preamble = f"""
38
+ If you want to use it, please install langroid with one of these
39
+ extras: {', '.join(extra)}. The examples below use the first one,
40
+ i.e. {extra[0]}.
41
+ """
42
+ extra = extra[0]
43
+ else:
44
+ help_preamble = f"""
45
+ If you want to use it, please install langroid with the
46
+ `{extra}` extra.
47
+ """
48
+
36
49
  install_help = f"""
37
- If you want to use it, please install langroid
38
- with the `{extra}` extra, for example:
50
+ {help_preamble}
39
51
 
40
52
  If you are using pip:
41
53
  pip install "langroid[{extra}]"
@@ -65,7 +65,7 @@ class CodeParsingConfig(BaseSettings):
65
65
  "bash",
66
66
  ]
67
67
  chunk_size: int = 500 # tokens
68
- token_encoding_model: str = "text-embedding-ada-002"
68
+ token_encoding_model: str = "text-embedding-3-small"
69
69
  n_similar_docs: int = 4
70
70
 
71
71
 
@@ -5,9 +5,10 @@ import logging
5
5
  import re
6
6
  from enum import Enum
7
7
  from io import BytesIO
8
- from typing import TYPE_CHECKING, Any, Generator, List, Tuple
8
+ from typing import TYPE_CHECKING, Any, Dict, Generator, List, Tuple
9
9
 
10
10
  from langroid.exceptions import LangroidImportError
11
+ from langroid.parsing.pdf_utils import pdf_split_pages
11
12
  from langroid.utils.object_registry import ObjectRegistry
12
13
 
13
14
  try:
@@ -15,18 +16,24 @@ try:
15
16
  except ImportError:
16
17
  if not TYPE_CHECKING:
17
18
  fitz = None
19
+ try:
20
+ import pymupdf4llm
21
+ except ImportError:
22
+ if not TYPE_CHECKING:
23
+ pymupdf4llm = None
18
24
 
19
25
  try:
20
- import pypdf
26
+ import docling
21
27
  except ImportError:
22
28
  if not TYPE_CHECKING:
23
- pypdf = None
29
+ docling = None
24
30
 
25
31
  try:
26
- import pdfplumber
32
+ import pypdf
27
33
  except ImportError:
28
34
  if not TYPE_CHECKING:
29
- pdfplumber = None
35
+ pypdf = None
36
+
30
37
 
31
38
  import requests
32
39
  from bs4 import BeautifulSoup
@@ -41,6 +48,7 @@ logger = logging.getLogger(__name__)
41
48
 
42
49
 
43
50
  class DocumentType(str, Enum):
51
+ # TODO add `md` (Markdown) and `html`
44
52
  PDF = "pdf"
45
53
  DOCX = "docx"
46
54
  DOC = "doc"
@@ -139,10 +147,12 @@ class DocumentParser(Parser):
139
147
  if inferred_doc_type == DocumentType.PDF:
140
148
  if config.pdf.library == "fitz":
141
149
  return FitzPDFParser(source, config)
150
+ elif config.pdf.library == "pymupdf4llm":
151
+ return PyMuPDF4LLMParser(source, config)
152
+ elif config.pdf.library == "docling":
153
+ return DoclingParser(source, config)
142
154
  elif config.pdf.library == "pypdf":
143
155
  return PyPDFParser(source, config)
144
- elif config.pdf.library == "pdfplumber":
145
- return PDFPlumberParser(source, config)
146
156
  elif config.pdf.library == "unstructured":
147
157
  return UnstructuredPDFParser(source, config)
148
158
  elif config.pdf.library == "pdf2image":
@@ -307,8 +317,11 @@ class DocumentParser(Parser):
307
317
  """Yield each page in the PDF."""
308
318
  raise NotImplementedError
309
319
 
310
- def extract_text_from_page(self, page: Any) -> str:
311
- """Extract text from a given page."""
320
+ def get_document_from_page(self, page: Any) -> Document:
321
+ """
322
+ Get Langroid Document object (with possible metadata)
323
+ corresponding to a given page.
324
+ """
312
325
  raise NotImplementedError
313
326
 
314
327
  def fix_text(self, text: str) -> str:
@@ -335,7 +348,10 @@ class DocumentParser(Parser):
335
348
  """
336
349
 
337
350
  text = "".join(
338
- [self.extract_text_from_page(page) for _, page in self.iterate_pages()]
351
+ [
352
+ self.get_document_from_page(page).content
353
+ for _, page in self.iterate_pages()
354
+ ]
339
355
  )
340
356
  return Document(content=text, metadata=DocMetaData(source=self.source))
341
357
 
@@ -359,7 +375,10 @@ class DocumentParser(Parser):
359
375
  common_id = ObjectRegistry.new_id()
360
376
  n_chunks = 0 # how many chunk so far
361
377
  for i, page in self.iterate_pages():
362
- page_text = self.extract_text_from_page(page)
378
+ # not used but could be useful, esp to blend the
379
+ # metadata from the pages into the chunks
380
+ page_doc = self.get_document_from_page(page)
381
+ page_text = page_doc.content
363
382
  split += self.tokenizer.encode(page_text)
364
383
  pages.append(str(i + 1))
365
384
  # split could be so long it needs to be split
@@ -422,81 +441,146 @@ class FitzPDFParser(DocumentParser):
422
441
  yield i, page
423
442
  doc.close()
424
443
 
425
- def extract_text_from_page(self, page: "fitz.Page") -> str:
444
+ def get_document_from_page(self, page: "fitz.Page") -> Document:
426
445
  """
427
- Extract text from a given `fitz` page.
446
+ Get Document object from a given `fitz` page.
428
447
 
429
448
  Args:
430
449
  page (fitz.Page): The `fitz` page object.
431
450
 
432
451
  Returns:
433
- str: Extracted text from the page.
452
+ Document: Document object, with content and possible metadata.
434
453
  """
435
- return self.fix_text(page.get_text())
454
+ return Document(
455
+ content=self.fix_text(page.get_text()),
456
+ metadata=DocMetaData(source=self.source),
457
+ )
436
458
 
437
459
 
438
- class PyPDFParser(DocumentParser):
460
+ class PyMuPDF4LLMParser(DocumentParser):
439
461
  """
440
- Parser for processing PDFs using the `pypdf` library.
462
+ Parser for processing PDFs using the `pymupdf4llm` library.
441
463
  """
442
464
 
443
- def iterate_pages(self) -> Generator[Tuple[int, pypdf.PageObject], None, None]:
465
+ def iterate_pages(self) -> Generator[Tuple[int, "fitz.Page"], None, None]:
444
466
  """
445
- Yield each page in the PDF using `pypdf`.
467
+ Yield each page in the PDF using `fitz`.
446
468
 
447
469
  Returns:
448
- Generator[pypdf.pdf.PageObject]: Generator yielding each page.
470
+ Generator[fitz.Page]: Generator yielding each page.
449
471
  """
450
- if pypdf is None:
451
- raise LangroidImportError("pypdf", "pdf-parsers")
452
- reader = pypdf.PdfReader(self.doc_bytes)
453
- for i, page in enumerate(reader.pages):
472
+ if fitz is None:
473
+ raise LangroidImportError(
474
+ "pymupdf4llm", ["pymupdf4llm", "all", "pdf-parsers", "doc-chat"]
475
+ )
476
+ doc: fitz.Document = fitz.open(stream=self.doc_bytes, filetype="pdf")
477
+ pages: List[Dict[str, Any]] = pymupdf4llm.to_markdown(doc, page_chunks=True)
478
+ for i, page in enumerate(pages):
454
479
  yield i, page
480
+ doc.close()
455
481
 
456
- def extract_text_from_page(self, page: pypdf.PageObject) -> str:
482
+ def get_document_from_page(self, page: Dict[str, Any]) -> Document:
457
483
  """
458
- Extract text from a given `pypdf` page.
484
+ Get Document object corresponding to a given "page-chunk"
485
+ dictionary, see:
486
+ https://pymupdf.readthedocs.io/en/latest/pymupdf4llm/api.html
487
+
459
488
 
460
489
  Args:
461
- page (pypdf.pdf.PageObject): The `pypdf` page object.
490
+ page (Dict[str,Any]): The "page-chunk" dictionary.
462
491
 
463
492
  Returns:
464
- str: Extracted text from the page.
493
+ Document: Document object, with content and possible metadata.
465
494
  """
466
- return self.fix_text(page.extract_text())
495
+ return Document(
496
+ content=self.fix_text(page.get("text", "")),
497
+ # TODO could possible use other metadata from page, see above link.
498
+ metadata=DocMetaData(source=self.source),
499
+ )
467
500
 
468
501
 
469
- class PDFPlumberParser(DocumentParser):
502
+ class DoclingParser(DocumentParser):
470
503
  """
471
- Parser for processing PDFs using the `pdfplumber` library.
504
+ Parser for processing PDFs using the `docling` library.
472
505
  """
473
506
 
474
- def iterate_pages(
475
- self,
476
- ) -> (Generator)[Tuple[int, pdfplumber.pdf.Page], None, None]: # type: ignore
507
+ def iterate_pages(self) -> Generator[Tuple[int, Any], None, None]:
508
+ """
509
+ Yield each page in the PDF using `docling`.
510
+
511
+ Returns:
512
+ Generator[docling.Page]: Generator yielding each page.
477
513
  """
478
- Yield each page in the PDF using `pdfplumber`.
514
+ if docling is None:
515
+ raise LangroidImportError(
516
+ "docling", ["docling", "pdf-parsers", "all", "doc-chat"]
517
+ )
518
+
519
+ from docling.document_converter import ( # type: ignore
520
+ ConversionResult,
521
+ DocumentConverter,
522
+ )
523
+ from docling_core.types.doc import ImageRefMode # type: ignore
524
+
525
+ page_files, tmp_dir = pdf_split_pages(self.doc_bytes)
526
+ converter = DocumentConverter()
527
+ for i, page_file in enumerate(page_files):
528
+ result: ConversionResult = converter.convert(page_file)
529
+ md_text = result.document.export_to_markdown(
530
+ image_mode=ImageRefMode.REFERENCED
531
+ )
532
+ yield i, md_text
533
+
534
+ tmp_dir.cleanup()
535
+
536
+ def get_document_from_page(self, page: str) -> Document:
537
+ """
538
+ Get Document object from a given `docling` "page" (actually a chunk).
539
+
540
+ Args:
541
+ page (docling.chunking.DocChunk): The `docling` chunk
479
542
 
480
543
  Returns:
481
- Generator[pdfplumber.Page]: Generator yielding each page.
544
+ Document: Document object, with content and possible metadata.
482
545
  """
483
- if pdfplumber is None:
484
- raise LangroidImportError("pdfplumber", "pdf-parsers")
485
- with pdfplumber.open(self.doc_bytes) as pdf:
486
- for i, page in enumerate(pdf.pages):
487
- yield i, page
546
+ return Document(
547
+ content=self.fix_text(page),
548
+ metadata=DocMetaData(source=self.source),
549
+ )
550
+
488
551
 
489
- def extract_text_from_page(self, page: pdfplumber.pdf.Page) -> str: # type: ignore
552
+ class PyPDFParser(DocumentParser):
553
+ """
554
+ Parser for processing PDFs using the `pypdf` library.
555
+ """
556
+
557
+ def iterate_pages(self) -> Generator[Tuple[int, pypdf.PageObject], None, None]:
490
558
  """
491
- Extract text from a given `pdfplumber` page.
559
+ Yield each page in the PDF using `pypdf`.
560
+
561
+ Returns:
562
+ Generator[pypdf.pdf.PageObject]: Generator yielding each page.
563
+ """
564
+ if pypdf is None:
565
+ raise LangroidImportError("pypdf", "pdf-parsers")
566
+ reader = pypdf.PdfReader(self.doc_bytes)
567
+ for i, page in enumerate(reader.pages):
568
+ yield i, page
569
+
570
+ def get_document_from_page(self, page: pypdf.PageObject) -> Document:
571
+ """
572
+ Get Document object from a given `pypdf` page.
492
573
 
493
574
  Args:
494
- page (pdfplumber.Page): The `pdfplumber` page object.
575
+ page (pypdf.pdf.PageObject): The `pypdf` page object.
495
576
 
496
577
  Returns:
497
- str: Extracted text from the page.
578
+ Document: Document object, with content and possible metadata.
498
579
  """
499
- return self.fix_text(page.extract_text())
580
+ return Document(
581
+ content=self.fix_text(page.extract_text()),
582
+ metadata=DocMetaData(source=self.source),
583
+ )
500
584
 
501
585
 
502
586
  class ImagePdfParser(DocumentParser):
@@ -516,15 +600,15 @@ class ImagePdfParser(DocumentParser):
516
600
  for i, image in enumerate(images):
517
601
  yield i, image
518
602
 
519
- def extract_text_from_page(self, page: "Image") -> str: # type: ignore
603
+ def get_document_from_page(self, page: "Image") -> Document: # type: ignore
520
604
  """
521
- Extract text from a given `pdf2image` page.
605
+ Get Document object corresponding to a given `pdf2image` page.
522
606
 
523
607
  Args:
524
608
  page (Image): The PIL Image object.
525
609
 
526
610
  Returns:
527
- str: Extracted text from the image.
611
+ Document: Document object, with content and possible metadata.
528
612
  """
529
613
  try:
530
614
  import pytesseract
@@ -532,7 +616,10 @@ class ImagePdfParser(DocumentParser):
532
616
  raise LangroidImportError("pytesseract", "pdf-parsers")
533
617
 
534
618
  text = pytesseract.image_to_string(page)
535
- return self.fix_text(text)
619
+ return Document(
620
+ content=self.fix_text(text),
621
+ metadata=DocMetaData(source=self.source),
622
+ )
536
623
 
537
624
 
538
625
  class UnstructuredPDFParser(DocumentParser):
@@ -564,8 +651,8 @@ class UnstructuredPDFParser(DocumentParser):
564
651
  The `unstructured` library failed to parse the pdf.
565
652
  Please try a different library by setting the `library` field
566
653
  in the `pdf` section of the `parsing` field in the config file.
567
- Supported libraries are:
568
- fitz, pypdf, pdfplumber, unstructured
654
+ Other supported libraries are:
655
+ fitz, pymupdf4llm, pypdf
569
656
  """
570
657
  )
571
658
 
@@ -584,18 +671,21 @@ class UnstructuredPDFParser(DocumentParser):
584
671
  if page_elements:
585
672
  yield page_number, page_elements
586
673
 
587
- def extract_text_from_page(self, page: Any) -> str:
674
+ def get_document_from_page(self, page: Any) -> Document:
588
675
  """
589
- Extract text from a given `unstructured` element.
676
+ Get Document object from a given `unstructured` element.
590
677
 
591
678
  Args:
592
679
  page (unstructured element): The `unstructured` element object.
593
680
 
594
681
  Returns:
595
- str: Extracted text from the element.
682
+ Document: Document object, with content and possible metadata.
596
683
  """
597
684
  text = " ".join(el.text for el in page)
598
- return self.fix_text(text)
685
+ return Document(
686
+ content=self.fix_text(text),
687
+ metadata=DocMetaData(source=self.source),
688
+ )
599
689
 
600
690
 
601
691
  class UnstructuredDocxParser(DocumentParser):
@@ -632,9 +722,9 @@ class UnstructuredDocxParser(DocumentParser):
632
722
  if page_elements:
633
723
  yield page_number, page_elements
634
724
 
635
- def extract_text_from_page(self, page: Any) -> str:
725
+ def get_document_from_page(self, page: Any) -> Document:
636
726
  """
637
- Extract text from a given `unstructured` element.
727
+ Get Document object from a given `unstructured` element.
638
728
 
639
729
  Note:
640
730
  The concept of "pages" doesn't actually exist in the .docx file format in
@@ -647,10 +737,13 @@ class UnstructuredDocxParser(DocumentParser):
647
737
  page (unstructured element): The `unstructured` element object.
648
738
 
649
739
  Returns:
650
- str: Extracted text from the element.
740
+ Document: Document object, with content and possible metadata.
651
741
  """
652
742
  text = " ".join(el.text for el in page)
653
- return self.fix_text(text)
743
+ return Document(
744
+ content=self.fix_text(text),
745
+ metadata=DocMetaData(source=self.source),
746
+ )
654
747
 
655
748
 
656
749
  class UnstructuredDocParser(UnstructuredDocxParser):
@@ -704,15 +797,19 @@ class PythonDocxParser(DocumentParser):
704
797
  for i, para in enumerate(doc.paragraphs, start=1):
705
798
  yield i, [para]
706
799
 
707
- def extract_text_from_page(self, page: Any) -> str:
800
+ def get_document_from_page(self, page: Any) -> Document:
708
801
  """
709
- Extract text from a given 'page', which in this case is a single paragraph.
802
+ Get Document object from a given 'page', which in this case is a single
803
+ paragraph.
710
804
 
711
805
  Args:
712
806
  page (list): A list containing a single Paragraph object.
713
807
 
714
808
  Returns:
715
- str: Extracted text from the paragraph.
809
+ Document: Document object, with content and possible metadata.
716
810
  """
717
811
  paragraph = page[0]
718
- return self.fix_text(paragraph.text)
812
+ return Document(
813
+ content=self.fix_text(paragraph.text),
814
+ metadata=DocMetaData(source=self.source),
815
+ )
@@ -23,11 +23,12 @@ class Splitter(str, Enum):
23
23
  class PdfParsingConfig(BaseSettings):
24
24
  library: Literal[
25
25
  "fitz",
26
- "pdfplumber",
26
+ "pymupdf4llm",
27
+ "docling",
27
28
  "pypdf",
28
29
  "unstructured",
29
30
  "pdf2image",
30
- ] = "pdfplumber"
31
+ ] = "pymupdf4llm"
31
32
 
32
33
 
33
34
  class DocxParsingConfig(BaseSettings):
@@ -40,6 +41,7 @@ class DocParsingConfig(BaseSettings):
40
41
 
41
42
  class ParsingConfig(BaseSettings):
42
43
  splitter: str = Splitter.TOKENS
44
+ chunk_by_page: bool = False # split by page?
43
45
  chunk_size: int = 200 # aim for this many tokens per chunk
44
46
  overlap: int = 50 # overlap between chunks
45
47
  max_chunks: int = 10_000
@@ -49,7 +51,7 @@ class ParsingConfig(BaseSettings):
49
51
  n_similar_docs: int = 4
50
52
  n_neighbor_ids: int = 5 # window size to store around each chunk
51
53
  separators: List[str] = ["\n\n", "\n", " ", ""]
52
- token_encoding_model: str = "text-embedding-ada-002"
54
+ token_encoding_model: str = "text-embedding-3-large"
53
55
  pdf: PdfParsingConfig = PdfParsingConfig()
54
56
  docx: DocxParsingConfig = DocxParsingConfig()
55
57
  doc: DocParsingConfig = DocParsingConfig()
@@ -61,7 +63,7 @@ class Parser:
61
63
  try:
62
64
  self.tokenizer = tiktoken.encoding_for_model(config.token_encoding_model)
63
65
  except Exception:
64
- self.tokenizer = tiktoken.encoding_for_model("text-embedding-ada-002")
66
+ self.tokenizer = tiktoken.encoding_for_model("text-embedding-3-small")
65
67
 
66
68
  def num_tokens(self, text: str) -> int:
67
69
  tokens = self.tokenizer.encode(text)