langroid 0.44.0__tar.gz → 0.45.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (134)
  1. {langroid-0.44.0 → langroid-0.45.1}/PKG-INFO +9 -5
  2. {langroid-0.44.0 → langroid-0.45.1}/README.md +2 -2
  3. {langroid-0.44.0 → langroid-0.45.1}/langroid/agent/base.py +1 -1
  4. {langroid-0.44.0 → langroid-0.45.1}/langroid/parsing/document_parser.py +84 -0
  5. {langroid-0.44.0 → langroid-0.45.1}/langroid/parsing/parser.py +20 -5
  6. {langroid-0.44.0 → langroid-0.45.1}/pyproject.toml +9 -3
  7. {langroid-0.44.0 → langroid-0.45.1}/.gitignore +0 -0
  8. {langroid-0.44.0 → langroid-0.45.1}/LICENSE +0 -0
  9. {langroid-0.44.0 → langroid-0.45.1}/langroid/__init__.py +0 -0
  10. {langroid-0.44.0 → langroid-0.45.1}/langroid/agent/__init__.py +0 -0
  11. {langroid-0.44.0 → langroid-0.45.1}/langroid/agent/batch.py +0 -0
  12. {langroid-0.44.0 → langroid-0.45.1}/langroid/agent/callbacks/__init__.py +0 -0
  13. {langroid-0.44.0 → langroid-0.45.1}/langroid/agent/callbacks/chainlit.py +0 -0
  14. {langroid-0.44.0 → langroid-0.45.1}/langroid/agent/chat_agent.py +0 -0
  15. {langroid-0.44.0 → langroid-0.45.1}/langroid/agent/chat_document.py +0 -0
  16. {langroid-0.44.0 → langroid-0.45.1}/langroid/agent/openai_assistant.py +0 -0
  17. {langroid-0.44.0 → langroid-0.45.1}/langroid/agent/special/__init__.py +0 -0
  18. {langroid-0.44.0 → langroid-0.45.1}/langroid/agent/special/arangodb/__init__.py +0 -0
  19. {langroid-0.44.0 → langroid-0.45.1}/langroid/agent/special/arangodb/arangodb_agent.py +0 -0
  20. {langroid-0.44.0 → langroid-0.45.1}/langroid/agent/special/arangodb/system_messages.py +0 -0
  21. {langroid-0.44.0 → langroid-0.45.1}/langroid/agent/special/arangodb/tools.py +0 -0
  22. {langroid-0.44.0 → langroid-0.45.1}/langroid/agent/special/arangodb/utils.py +0 -0
  23. {langroid-0.44.0 → langroid-0.45.1}/langroid/agent/special/doc_chat_agent.py +0 -0
  24. {langroid-0.44.0 → langroid-0.45.1}/langroid/agent/special/lance_doc_chat_agent.py +0 -0
  25. {langroid-0.44.0 → langroid-0.45.1}/langroid/agent/special/lance_rag/__init__.py +0 -0
  26. {langroid-0.44.0 → langroid-0.45.1}/langroid/agent/special/lance_rag/critic_agent.py +0 -0
  27. {langroid-0.44.0 → langroid-0.45.1}/langroid/agent/special/lance_rag/lance_rag_task.py +0 -0
  28. {langroid-0.44.0 → langroid-0.45.1}/langroid/agent/special/lance_rag/query_planner_agent.py +0 -0
  29. {langroid-0.44.0 → langroid-0.45.1}/langroid/agent/special/lance_tools.py +0 -0
  30. {langroid-0.44.0 → langroid-0.45.1}/langroid/agent/special/neo4j/__init__.py +0 -0
  31. {langroid-0.44.0 → langroid-0.45.1}/langroid/agent/special/neo4j/csv_kg_chat.py +0 -0
  32. {langroid-0.44.0 → langroid-0.45.1}/langroid/agent/special/neo4j/neo4j_chat_agent.py +0 -0
  33. {langroid-0.44.0 → langroid-0.45.1}/langroid/agent/special/neo4j/system_messages.py +0 -0
  34. {langroid-0.44.0 → langroid-0.45.1}/langroid/agent/special/neo4j/tools.py +0 -0
  35. {langroid-0.44.0 → langroid-0.45.1}/langroid/agent/special/relevance_extractor_agent.py +0 -0
  36. {langroid-0.44.0 → langroid-0.45.1}/langroid/agent/special/retriever_agent.py +0 -0
  37. {langroid-0.44.0 → langroid-0.45.1}/langroid/agent/special/sql/__init__.py +0 -0
  38. {langroid-0.44.0 → langroid-0.45.1}/langroid/agent/special/sql/sql_chat_agent.py +0 -0
  39. {langroid-0.44.0 → langroid-0.45.1}/langroid/agent/special/sql/utils/__init__.py +0 -0
  40. {langroid-0.44.0 → langroid-0.45.1}/langroid/agent/special/sql/utils/description_extractors.py +0 -0
  41. {langroid-0.44.0 → langroid-0.45.1}/langroid/agent/special/sql/utils/populate_metadata.py +0 -0
  42. {langroid-0.44.0 → langroid-0.45.1}/langroid/agent/special/sql/utils/system_message.py +0 -0
  43. {langroid-0.44.0 → langroid-0.45.1}/langroid/agent/special/sql/utils/tools.py +0 -0
  44. {langroid-0.44.0 → langroid-0.45.1}/langroid/agent/special/table_chat_agent.py +0 -0
  45. {langroid-0.44.0 → langroid-0.45.1}/langroid/agent/task.py +0 -0
  46. {langroid-0.44.0 → langroid-0.45.1}/langroid/agent/tool_message.py +0 -0
  47. {langroid-0.44.0 → langroid-0.45.1}/langroid/agent/tools/__init__.py +0 -0
  48. {langroid-0.44.0 → langroid-0.45.1}/langroid/agent/tools/duckduckgo_search_tool.py +0 -0
  49. {langroid-0.44.0 → langroid-0.45.1}/langroid/agent/tools/exa_search_tool.py +0 -0
  50. {langroid-0.44.0 → langroid-0.45.1}/langroid/agent/tools/file_tools.py +0 -0
  51. {langroid-0.44.0 → langroid-0.45.1}/langroid/agent/tools/google_search_tool.py +0 -0
  52. {langroid-0.44.0 → langroid-0.45.1}/langroid/agent/tools/metaphor_search_tool.py +0 -0
  53. {langroid-0.44.0 → langroid-0.45.1}/langroid/agent/tools/orchestration.py +0 -0
  54. {langroid-0.44.0 → langroid-0.45.1}/langroid/agent/tools/recipient_tool.py +0 -0
  55. {langroid-0.44.0 → langroid-0.45.1}/langroid/agent/tools/retrieval_tool.py +0 -0
  56. {langroid-0.44.0 → langroid-0.45.1}/langroid/agent/tools/rewind_tool.py +0 -0
  57. {langroid-0.44.0 → langroid-0.45.1}/langroid/agent/tools/segment_extract_tool.py +0 -0
  58. {langroid-0.44.0 → langroid-0.45.1}/langroid/agent/tools/tavily_search_tool.py +0 -0
  59. {langroid-0.44.0 → langroid-0.45.1}/langroid/agent/xml_tool_message.py +0 -0
  60. {langroid-0.44.0 → langroid-0.45.1}/langroid/cachedb/__init__.py +0 -0
  61. {langroid-0.44.0 → langroid-0.45.1}/langroid/cachedb/base.py +0 -0
  62. {langroid-0.44.0 → langroid-0.45.1}/langroid/cachedb/momento_cachedb.py +0 -0
  63. {langroid-0.44.0 → langroid-0.45.1}/langroid/cachedb/redis_cachedb.py +0 -0
  64. {langroid-0.44.0 → langroid-0.45.1}/langroid/embedding_models/__init__.py +0 -0
  65. {langroid-0.44.0 → langroid-0.45.1}/langroid/embedding_models/base.py +0 -0
  66. {langroid-0.44.0 → langroid-0.45.1}/langroid/embedding_models/models.py +0 -0
  67. {langroid-0.44.0 → langroid-0.45.1}/langroid/embedding_models/protoc/__init__.py +0 -0
  68. {langroid-0.44.0 → langroid-0.45.1}/langroid/embedding_models/protoc/embeddings.proto +0 -0
  69. {langroid-0.44.0 → langroid-0.45.1}/langroid/embedding_models/protoc/embeddings_pb2.py +0 -0
  70. {langroid-0.44.0 → langroid-0.45.1}/langroid/embedding_models/protoc/embeddings_pb2.pyi +0 -0
  71. {langroid-0.44.0 → langroid-0.45.1}/langroid/embedding_models/protoc/embeddings_pb2_grpc.py +0 -0
  72. {langroid-0.44.0 → langroid-0.45.1}/langroid/embedding_models/remote_embeds.py +0 -0
  73. {langroid-0.44.0 → langroid-0.45.1}/langroid/exceptions.py +0 -0
  74. {langroid-0.44.0 → langroid-0.45.1}/langroid/language_models/__init__.py +0 -0
  75. {langroid-0.44.0 → langroid-0.45.1}/langroid/language_models/azure_openai.py +0 -0
  76. {langroid-0.44.0 → langroid-0.45.1}/langroid/language_models/base.py +0 -0
  77. {langroid-0.44.0 → langroid-0.45.1}/langroid/language_models/config.py +0 -0
  78. {langroid-0.44.0 → langroid-0.45.1}/langroid/language_models/mock_lm.py +0 -0
  79. {langroid-0.44.0 → langroid-0.45.1}/langroid/language_models/model_info.py +0 -0
  80. {langroid-0.44.0 → langroid-0.45.1}/langroid/language_models/openai_gpt.py +0 -0
  81. {langroid-0.44.0 → langroid-0.45.1}/langroid/language_models/prompt_formatter/__init__.py +0 -0
  82. {langroid-0.44.0 → langroid-0.45.1}/langroid/language_models/prompt_formatter/base.py +0 -0
  83. {langroid-0.44.0 → langroid-0.45.1}/langroid/language_models/prompt_formatter/hf_formatter.py +0 -0
  84. {langroid-0.44.0 → langroid-0.45.1}/langroid/language_models/prompt_formatter/llama2_formatter.py +0 -0
  85. {langroid-0.44.0 → langroid-0.45.1}/langroid/language_models/utils.py +0 -0
  86. {langroid-0.44.0 → langroid-0.45.1}/langroid/mytypes.py +0 -0
  87. {langroid-0.44.0 → langroid-0.45.1}/langroid/parsing/__init__.py +0 -0
  88. {langroid-0.44.0 → langroid-0.45.1}/langroid/parsing/agent_chats.py +0 -0
  89. {langroid-0.44.0 → langroid-0.45.1}/langroid/parsing/code_parser.py +0 -0
  90. {langroid-0.44.0 → langroid-0.45.1}/langroid/parsing/para_sentence_split.py +0 -0
  91. {langroid-0.44.0 → langroid-0.45.1}/langroid/parsing/parse_json.py +0 -0
  92. {langroid-0.44.0 → langroid-0.45.1}/langroid/parsing/pdf_utils.py +0 -0
  93. {langroid-0.44.0 → langroid-0.45.1}/langroid/parsing/repo_loader.py +0 -0
  94. {langroid-0.44.0 → langroid-0.45.1}/langroid/parsing/routing.py +0 -0
  95. {langroid-0.44.0 → langroid-0.45.1}/langroid/parsing/search.py +0 -0
  96. {langroid-0.44.0 → langroid-0.45.1}/langroid/parsing/spider.py +0 -0
  97. {langroid-0.44.0 → langroid-0.45.1}/langroid/parsing/table_loader.py +0 -0
  98. {langroid-0.44.0 → langroid-0.45.1}/langroid/parsing/url_loader.py +0 -0
  99. {langroid-0.44.0 → langroid-0.45.1}/langroid/parsing/urls.py +0 -0
  100. {langroid-0.44.0 → langroid-0.45.1}/langroid/parsing/utils.py +0 -0
  101. {langroid-0.44.0 → langroid-0.45.1}/langroid/parsing/web_search.py +0 -0
  102. {langroid-0.44.0 → langroid-0.45.1}/langroid/prompts/__init__.py +0 -0
  103. {langroid-0.44.0 → langroid-0.45.1}/langroid/prompts/dialog.py +0 -0
  104. {langroid-0.44.0 → langroid-0.45.1}/langroid/prompts/prompts_config.py +0 -0
  105. {langroid-0.44.0 → langroid-0.45.1}/langroid/prompts/templates.py +0 -0
  106. {langroid-0.44.0 → langroid-0.45.1}/langroid/py.typed +0 -0
  107. {langroid-0.44.0 → langroid-0.45.1}/langroid/pydantic_v1/__init__.py +0 -0
  108. {langroid-0.44.0 → langroid-0.45.1}/langroid/pydantic_v1/main.py +0 -0
  109. {langroid-0.44.0 → langroid-0.45.1}/langroid/utils/__init__.py +0 -0
  110. {langroid-0.44.0 → langroid-0.45.1}/langroid/utils/algorithms/__init__.py +0 -0
  111. {langroid-0.44.0 → langroid-0.45.1}/langroid/utils/algorithms/graph.py +0 -0
  112. {langroid-0.44.0 → langroid-0.45.1}/langroid/utils/configuration.py +0 -0
  113. {langroid-0.44.0 → langroid-0.45.1}/langroid/utils/constants.py +0 -0
  114. {langroid-0.44.0 → langroid-0.45.1}/langroid/utils/git_utils.py +0 -0
  115. {langroid-0.44.0 → langroid-0.45.1}/langroid/utils/globals.py +0 -0
  116. {langroid-0.44.0 → langroid-0.45.1}/langroid/utils/logging.py +0 -0
  117. {langroid-0.44.0 → langroid-0.45.1}/langroid/utils/object_registry.py +0 -0
  118. {langroid-0.44.0 → langroid-0.45.1}/langroid/utils/output/__init__.py +0 -0
  119. {langroid-0.44.0 → langroid-0.45.1}/langroid/utils/output/citations.py +0 -0
  120. {langroid-0.44.0 → langroid-0.45.1}/langroid/utils/output/printing.py +0 -0
  121. {langroid-0.44.0 → langroid-0.45.1}/langroid/utils/output/status.py +0 -0
  122. {langroid-0.44.0 → langroid-0.45.1}/langroid/utils/pandas_utils.py +0 -0
  123. {langroid-0.44.0 → langroid-0.45.1}/langroid/utils/pydantic_utils.py +0 -0
  124. {langroid-0.44.0 → langroid-0.45.1}/langroid/utils/system.py +0 -0
  125. {langroid-0.44.0 → langroid-0.45.1}/langroid/utils/types.py +0 -0
  126. {langroid-0.44.0 → langroid-0.45.1}/langroid/vector_store/__init__.py +0 -0
  127. {langroid-0.44.0 → langroid-0.45.1}/langroid/vector_store/base.py +0 -0
  128. {langroid-0.44.0 → langroid-0.45.1}/langroid/vector_store/chromadb.py +0 -0
  129. {langroid-0.44.0 → langroid-0.45.1}/langroid/vector_store/lancedb.py +0 -0
  130. {langroid-0.44.0 → langroid-0.45.1}/langroid/vector_store/meilisearch.py +0 -0
  131. {langroid-0.44.0 → langroid-0.45.1}/langroid/vector_store/pineconedb.py +0 -0
  132. {langroid-0.44.0 → langroid-0.45.1}/langroid/vector_store/postgres.py +0 -0
  133. {langroid-0.44.0 → langroid-0.45.1}/langroid/vector_store/qdrantdb.py +0 -0
  134. {langroid-0.44.0 → langroid-0.45.1}/langroid/vector_store/weaviatedb.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: langroid
3
- Version: 0.44.0
3
+ Version: 0.45.1
4
4
  Summary: Harness LLMs with Multi-Agent Programming
5
5
  Author-email: Prasad Chalasani <pchalasani@gmail.com>
6
6
  License: MIT
@@ -12,7 +12,6 @@ Requires-Dist: async-generator<2.0,>=1.10
12
12
  Requires-Dist: bs4<1.0.0,>=0.0.1
13
13
  Requires-Dist: cerebras-cloud-sdk<2.0.0,>=1.1.0
14
14
  Requires-Dist: colorlog<7.0.0,>=6.7.0
15
- Requires-Dist: docling<3.0.0,>=2.20.0
16
15
  Requires-Dist: docstring-parser<1.0,>=0.16
17
16
  Requires-Dist: duckduckgo-search<7.0.0,>=6.0.0
18
17
  Requires-Dist: exa-py>=1.8.7
@@ -49,7 +48,6 @@ Requires-Dist: redis<6.0.0,>=5.0.1
49
48
  Requires-Dist: requests-oauthlib<2.0.0,>=1.3.1
50
49
  Requires-Dist: requests<3.0.0,>=2.31.0
51
50
  Requires-Dist: rich<14.0.0,>=13.3.4
52
- Requires-Dist: tavily-python>=0.5.0
53
51
  Requires-Dist: thefuzz<1.0.0,>=0.20.0
54
52
  Requires-Dist: tiktoken<1.0.0,>=0.7.0
55
53
  Requires-Dist: trafilatura<2.0.0,>=1.5.0
@@ -63,6 +61,7 @@ Requires-Dist: docling<3.0.0,>=2.16.0; extra == 'all'
63
61
  Requires-Dist: fastembed<0.4.0,>=0.3.1; extra == 'all'
64
62
  Requires-Dist: huggingface-hub<1.0.0,>=0.21.2; extra == 'all'
65
63
  Requires-Dist: litellm<2.0.0,>=1.30.1; extra == 'all'
64
+ Requires-Dist: marker-pdf; extra == 'all'
66
65
  Requires-Dist: metaphor-python<0.2.0,>=0.1.23; extra == 'all'
67
66
  Requires-Dist: neo4j<6.0.0,>=5.14.1; extra == 'all'
68
67
  Requires-Dist: pdf2image<2.0.0,>=1.17.0; extra == 'all'
@@ -99,6 +98,7 @@ Requires-Dist: pymysql<2.0.0,>=1.1.0; extra == 'db'
99
98
  Requires-Dist: sqlalchemy<3.0.0,>=2.0.19; extra == 'db'
100
99
  Provides-Extra: doc-chat
101
100
  Requires-Dist: docling<3.0.0,>=2.20.0; extra == 'doc-chat'
101
+ Requires-Dist: marker-pdf; extra == 'doc-chat'
102
102
  Requires-Dist: pdf2image<2.0.0,>=1.17.0; extra == 'doc-chat'
103
103
  Requires-Dist: pymupdf4llm<0.1.0,>=0.0.17; extra == 'doc-chat'
104
104
  Requires-Dist: pymupdf<2.0.0,>=1.23.3; extra == 'doc-chat'
@@ -138,6 +138,9 @@ Requires-Dist: pyarrow<16.0.0,>=15.0.0; extra == 'lancedb'
138
138
  Requires-Dist: tantivy<0.22.0,>=0.21.0; extra == 'lancedb'
139
139
  Provides-Extra: litellm
140
140
  Requires-Dist: litellm<2.0.0,>=1.30.1; extra == 'litellm'
141
+ Provides-Extra: marker-pdf
142
+ Requires-Dist: marker-pdf[full]>=1.6.0; (sys_platform != 'darwin' or platform_machine != 'x86_64') and extra == 'marker-pdf'
143
+ Requires-Dist: opencv-python>=4.11.0.86; extra == 'marker-pdf'
141
144
  Provides-Extra: meilisearch
142
145
  Requires-Dist: meilisearch-python-sdk<3.0.0,>=2.2.3; extra == 'meilisearch'
143
146
  Provides-Extra: metaphor
@@ -150,6 +153,7 @@ Provides-Extra: neo4j
150
153
  Requires-Dist: neo4j<6.0.0,>=5.14.1; extra == 'neo4j'
151
154
  Provides-Extra: pdf-parsers
152
155
  Requires-Dist: docling<3.0.0,>=2.16.0; extra == 'pdf-parsers'
156
+ Requires-Dist: marker-pdf; extra == 'pdf-parsers'
153
157
  Requires-Dist: markitdown>=0.0.1a3; extra == 'pdf-parsers'
154
158
  Requires-Dist: pdf2image<2.0.0,>=1.17.0; extra == 'pdf-parsers'
155
159
  Requires-Dist: pymupdf4llm<0.1.0,>=0.0.17; extra == 'pdf-parsers'
@@ -791,8 +795,8 @@ wget -O .env https://raw.githubusercontent.com/langroid/langroid/main/.env-templ
791
795
  # Edit the .env file with your favorite editor (here nano), and remove any un-used settings. E.g. there are "dummy" values like "your-redis-port" etc -- if you are not using them, you MUST remove them.
792
796
  nano .env
793
797
 
794
- # launch the container
795
- docker run -it --rm -v ./.env:/langroid/.env langroid/langroid
798
+ # launch the container (the appropriate image for your architecture will be pulled automatically)
799
+ docker run -it --rm -v ./.env:/langroid/.env langroid/langroid:latest
796
800
 
797
801
  # Use this command to run any of the scripts in the `examples` directory
798
802
  python examples/<Path/To/Example.py>
@@ -599,8 +599,8 @@ wget -O .env https://raw.githubusercontent.com/langroid/langroid/main/.env-templ
599
599
  # Edit the .env file with your favorite editor (here nano), and remove any un-used settings. E.g. there are "dummy" values like "your-redis-port" etc -- if you are not using them, you MUST remove them.
600
600
  nano .env
601
601
 
602
- # launch the container
603
- docker run -it --rm -v ./.env:/langroid/.env langroid/langroid
602
+ # launch the container (the appropriate image for your architecture will be pulled automatically)
603
+ docker run -it --rm -v ./.env:/langroid/.env langroid/langroid:latest
604
604
 
605
605
  # Use this command to run any of the scripts in the `examples` directory
606
606
  python examples/<Path/To/Example.py>
@@ -1016,7 +1016,7 @@ class Agent(ABC):
1016
1016
  # we would have already displayed the msg "live" ONLY if
1017
1017
  # streaming was enabled, AND we did not find a cached response
1018
1018
  # If we are here, it means the response has not yet been displayed.
1019
- cached = f"[red]{self.indent}(cached)[/red]" if response.cached else ""
1019
+ cached = "[red](cached)[/red]" if response.cached else ""
1020
1020
  console.print(f"[green]{self.indent}", end="")
1021
1021
  print(cached + "[green]" + escape(response.message))
1022
1022
  self.update_token_usage(
@@ -150,6 +150,8 @@ class DocumentParser(Parser):
150
150
  return ImagePdfParser(source, config)
151
151
  elif config.pdf.library == "gemini":
152
152
  return GeminiPdfParser(source, config)
153
+ elif config.pdf.library == "marker":
154
+ return MarkerPdfParser(source, config)
153
155
  else:
154
156
  raise ValueError(
155
157
  f"Unsupported PDF library specified: {config.pdf.library}"
@@ -1356,3 +1358,85 @@ class GeminiPdfParser(DocumentParser):
1356
1358
  content=page,
1357
1359
  metadata=DocMetaData(source=self.source),
1358
1360
  )
1361
+
1362
+
1363
+ class MarkerPdfParser(DocumentParser):
1364
+ DEFAULT_CONFIG = {"paginate_output": True, "output_format": "markdown"}
1365
+
1366
+ def __init__(self, source: Union[str, bytes], config: ParsingConfig):
1367
+ super().__init__(source, config)
1368
+ user_config = (
1369
+ config.pdf.marker_config.config_dict if config.pdf.marker_config else {}
1370
+ )
1371
+
1372
+ self.config_dict = {**MarkerPdfParser.DEFAULT_CONFIG, **user_config}
1373
+
1374
+ def iterate_pages(self) -> Generator[Tuple[int, Any], None, None]:
1375
+ """
1376
+ Yield each page in the PDF using `marker`.
1377
+ """
1378
+ try:
1379
+ import marker # noqa
1380
+ except ImportError:
1381
+ raise LangroidImportError(
1382
+ "marker-pdf", ["marker-pdf", "pdf-parsers", "all", "doc-chat"]
1383
+ )
1384
+
1385
+ import re
1386
+
1387
+ from marker.config.parser import ConfigParser
1388
+ from marker.converters.pdf import PdfConverter
1389
+ from marker.models import create_model_dict
1390
+ from marker.output import save_output
1391
+
1392
+ config_parser = ConfigParser(self.config_dict)
1393
+ converter = PdfConverter(
1394
+ config=config_parser.generate_config_dict(),
1395
+ artifact_dict=create_model_dict(),
1396
+ processor_list=config_parser.get_processors(),
1397
+ renderer=config_parser.get_renderer(),
1398
+ llm_service=config_parser.get_llm_service(),
1399
+ )
1400
+ doc_path = self.source
1401
+ if doc_path == "bytes":
1402
+ # write to tmp file, then use that path
1403
+ with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as temp_file:
1404
+ temp_file.write(self.doc_bytes.getvalue())
1405
+ doc_path = temp_file.name
1406
+
1407
+ output_dir = Path(str(Path(doc_path).with_suffix("")) + "-pages")
1408
+ os.makedirs(output_dir, exist_ok=True)
1409
+ filename = Path(doc_path).stem + "_converted"
1410
+
1411
+ rendered = converter(doc_path)
1412
+ save_output(rendered, output_dir=output_dir, fname_base=filename)
1413
+ file_path = output_dir / f"{filename}.md"
1414
+
1415
+ with open(file_path, "r", encoding="utf-8") as f:
1416
+ full_markdown = f.read()
1417
+
1418
+ # Regex for splitting pages
1419
+ pages = re.split(r"\{\d+\}----+", full_markdown)
1420
+
1421
+ page_no = 0
1422
+ for page in pages:
1423
+ if page.strip():
1424
+ yield page_no, page
1425
+ page_no += 1
1426
+
1427
+ def get_document_from_page(self, page: str) -> Document:
1428
+ """
1429
+ Get Document object from a given 1-page markdown file,
1430
+ possibly containing image refs.
1431
+
1432
+ Args:
1433
+ page (str): The page we get by splitting large md file from
1434
+ marker
1435
+
1436
+ Returns:
1437
+ Document: Document object, with content and possible metadata.
1438
+ """
1439
+ return Document(
1440
+ content=self.fix_text(page),
1441
+ metadata=DocMetaData(source=self.source),
1442
+ )
@@ -38,8 +38,13 @@ class GeminiConfig(BaseSettings):
38
38
  requests_per_minute: Optional[int] = 5
39
39
 
40
40
 
41
- class PdfParsingConfig(BaseParsingConfig):
41
+ class MarkerConfig(BaseSettings):
42
+ """Configuration for Marker-based parsing."""
43
+
44
+ config_dict: Dict[str, Any] = {}
42
45
 
46
+
47
+ class PdfParsingConfig(BaseParsingConfig):
43
48
  library: Literal[
44
49
  "fitz",
45
50
  "pymupdf4llm",
@@ -49,16 +54,26 @@ class PdfParsingConfig(BaseParsingConfig):
49
54
  "pdf2image",
50
55
  "markitdown",
51
56
  "gemini",
57
+ "marker",
52
58
  ] = "pymupdf4llm"
53
59
  gemini_config: Optional[GeminiConfig] = None
60
+ marker_config: Optional[MarkerConfig] = None
54
61
 
55
62
  @root_validator(pre=True)
56
- def enable_gemini_config(cls, values: Dict[str, Any]) -> Dict[str, Any]:
57
- """Ensure GeminiConfig is set only when library is 'gemini'."""
58
- if values.get("library") == "gemini":
59
- values["gemini_config"] = values.get("gemini_config") or GeminiConfig()
63
+ def enable_configs(cls, values: Dict[str, Any]) -> Dict[str, Any]:
64
+ """Ensure correct config is set based on library selection."""
65
+ library = values.get("library")
66
+
67
+ if library == "gemini":
68
+ values.setdefault("gemini_config", GeminiConfig())
60
69
  else:
61
70
  values["gemini_config"] = None
71
+
72
+ if library == "marker":
73
+ values.setdefault("marker_config", MarkerConfig())
74
+ else:
75
+ values["marker_config"] = None
76
+
62
77
  return values
63
78
 
64
79
 
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "langroid"
3
- version = "0.44.0"
3
+ version = "0.45.1"
4
4
  authors = [
5
5
  {name = "Prasad Chalasani", email = "pchalasani@gmail.com"},
6
6
  ]
@@ -15,7 +15,6 @@ dependencies = [
15
15
  "bs4<1.0.0,>=0.0.1",
16
16
  "cerebras-cloud-sdk<2.0.0,>=1.1.0",
17
17
  "colorlog<7.0.0,>=6.7.0",
18
- "docling<3.0.0,>=2.20.0",
19
18
  "docstring-parser<1.0,>=0.16",
20
19
  "duckduckgo-search<7.0.0,>=6.0.0",
21
20
  "exa-py>=1.8.7",
@@ -52,7 +51,6 @@ dependencies = [
52
51
  "requests<3.0.0,>=2.31.0",
53
52
  "requests-oauthlib<2.0.0,>=1.3.1",
54
53
  "rich<14.0.0,>=13.3.4",
55
- "tavily-python>=0.5.0",
56
54
  "thefuzz<1.0.0,>=0.20.0",
57
55
  "tiktoken<1.0.0,>=0.7.0",
58
56
  "trafilatura<2.0.0,>=1.5.0",
@@ -70,6 +68,7 @@ doc-chat = [
70
68
  "pytesseract<0.4.0,>=0.3.10",
71
69
  "python-docx<2.0.0,>=1.1.0",
72
70
  "unstructured[docx,pdf,pptx]<1.0.0,>=0.16.15",
71
+ "marker-pdf"
73
72
  ]
74
73
 
75
74
  hf-transformers = [
@@ -122,6 +121,7 @@ all = [
122
121
  "fastembed<0.4.0,>=0.3.1",
123
122
  "pgvector>=0.3.6",
124
123
  "psycopg2-binary>=2.9.10",
124
+ "marker-pdf",
125
125
  ]
126
126
 
127
127
  # More granular groupings
@@ -147,12 +147,18 @@ pdf-parsers = [
147
147
  "pdf2image<2.0.0,>=1.17.0",
148
148
  "pytesseract<0.4.0,>=0.3.10",
149
149
  "markitdown>=0.0.1a3",
150
+ "marker-pdf",
150
151
  ]
151
152
 
152
153
  docx = [
153
154
  "python-docx<2.0.0,>=1.1.0",
154
155
  ]
155
156
 
157
+ marker-pdf = [
158
+ "marker-pdf[full]>=1.6.0; sys_platform != 'darwin' or platform_machine != 'x86_64'",
159
+ "opencv-python>=4.11.0.86",
160
+ ]
161
+
156
162
  scrapy = [
157
163
  "scrapy<3.0.0,>=2.11.0",
158
164
  ]
File without changes
File without changes
File without changes
File without changes