comproscanner 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (61) hide show
  1. comproscanner-0.1.0/LICENSE +21 -0
  2. comproscanner-0.1.0/PKG-INFO +265 -0
  3. comproscanner-0.1.0/README.md +213 -0
  4. comproscanner-0.1.0/pyproject.toml +64 -0
  5. comproscanner-0.1.0/setup.cfg +4 -0
  6. comproscanner-0.1.0/src/comproscanner/__init__.py +182 -0
  7. comproscanner-0.1.0/src/comproscanner/article_processors/__init__.py +0 -0
  8. comproscanner-0.1.0/src/comproscanner/article_processors/elsevier_processor.py +956 -0
  9. comproscanner-0.1.0/src/comproscanner/article_processors/iop_processor.py +721 -0
  10. comproscanner-0.1.0/src/comproscanner/article_processors/pdfs_processor.py +247 -0
  11. comproscanner-0.1.0/src/comproscanner/article_processors/springer_processor.py +1003 -0
  12. comproscanner-0.1.0/src/comproscanner/article_processors/wiley_processor.py +543 -0
  13. comproscanner-0.1.0/src/comproscanner/comproscanner.py +634 -0
  14. comproscanner-0.1.0/src/comproscanner/data_visualizer.py +498 -0
  15. comproscanner-0.1.0/src/comproscanner/eval_visualizer.py +1229 -0
  16. comproscanner-0.1.0/src/comproscanner/extract_flow/__init__.py +0 -0
  17. comproscanner-0.1.0/src/comproscanner/extract_flow/crews/composition_crew/composition_extraction_crew/composition_extraction_crew.py +140 -0
  18. comproscanner-0.1.0/src/comproscanner/extract_flow/crews/composition_crew/composition_format_crew/composition_format_crew.py +141 -0
  19. comproscanner-0.1.0/src/comproscanner/extract_flow/crews/materials_data_identifier_crew/materials_data_identifier_crew.py +145 -0
  20. comproscanner-0.1.0/src/comproscanner/extract_flow/crews/synthesis_crew/synthesis_extraction_crew/synthesis_extraction_crew.py +139 -0
  21. comproscanner-0.1.0/src/comproscanner/extract_flow/crews/synthesis_crew/synthesis_format_crew/synthesis_format_crew.py +138 -0
  22. comproscanner-0.1.0/src/comproscanner/extract_flow/main_extraction_flow.py +619 -0
  23. comproscanner-0.1.0/src/comproscanner/extract_flow/tools/__init__.py +0 -0
  24. comproscanner-0.1.0/src/comproscanner/extract_flow/tools/material_parser_tool.py +141 -0
  25. comproscanner-0.1.0/src/comproscanner/extract_flow/tools/rag_tool.py +214 -0
  26. comproscanner-0.1.0/src/comproscanner/metadata_extractor/__init__.py +0 -0
  27. comproscanner-0.1.0/src/comproscanner/metadata_extractor/fetch_metadata.py +546 -0
  28. comproscanner-0.1.0/src/comproscanner/metadata_extractor/filter_metadata.py +603 -0
  29. comproscanner-0.1.0/src/comproscanner/post_processing/data_cleaner.py +320 -0
  30. comproscanner-0.1.0/src/comproscanner/post_processing/evaluation/eval_flow/crews/composition_evaluation_crew/composition_evaluation_crew.py +111 -0
  31. comproscanner-0.1.0/src/comproscanner/post_processing/evaluation/eval_flow/crews/synthesis_evaluation_crew/synthesis_evaluation_crew.py +100 -0
  32. comproscanner-0.1.0/src/comproscanner/post_processing/evaluation/eval_flow/eval_flow.py +1685 -0
  33. comproscanner-0.1.0/src/comproscanner/post_processing/evaluation/semantic_evaluator.py +1541 -0
  34. comproscanner-0.1.0/src/comproscanner/post_processing/visualization/create_knowledge_graph.py +1003 -0
  35. comproscanner-0.1.0/src/comproscanner/post_processing/visualization/data_distribution_visualizers.py +1387 -0
  36. comproscanner-0.1.0/src/comproscanner/post_processing/visualization/eval_plot_visualizers.py +3681 -0
  37. comproscanner-0.1.0/src/comproscanner/utils/__init__.py +0 -0
  38. comproscanner-0.1.0/src/comproscanner/utils/common_functions.py +84 -0
  39. comproscanner-0.1.0/src/comproscanner/utils/configs/__init__.py +17 -0
  40. comproscanner-0.1.0/src/comproscanner/utils/configs/article_keywords.py +281 -0
  41. comproscanner-0.1.0/src/comproscanner/utils/configs/base_urls.py +21 -0
  42. comproscanner-0.1.0/src/comproscanner/utils/configs/custom_dictionary.py +77 -0
  43. comproscanner-0.1.0/src/comproscanner/utils/configs/database_config.py +34 -0
  44. comproscanner-0.1.0/src/comproscanner/utils/configs/llm_config.py +62 -0
  45. comproscanner-0.1.0/src/comproscanner/utils/configs/paths_config.py +20 -0
  46. comproscanner-0.1.0/src/comproscanner/utils/configs/rag_config.py +44 -0
  47. comproscanner-0.1.0/src/comproscanner/utils/data_preparator.py +419 -0
  48. comproscanner-0.1.0/src/comproscanner/utils/database_manager.py +248 -0
  49. comproscanner-0.1.0/src/comproscanner/utils/embeddings.py +200 -0
  50. comproscanner-0.1.0/src/comproscanner/utils/error_handler.py +92 -0
  51. comproscanner-0.1.0/src/comproscanner/utils/get_paper_data.py +353 -0
  52. comproscanner-0.1.0/src/comproscanner/utils/logger.py +117 -0
  53. comproscanner-0.1.0/src/comproscanner/utils/pdf_to_markdown_text.py +439 -0
  54. comproscanner-0.1.0/src/comproscanner/utils/prepare_iop_files.py +323 -0
  55. comproscanner-0.1.0/src/comproscanner/utils/save_results.py +112 -0
  56. comproscanner-0.1.0/src/comproscanner.egg-info/PKG-INFO +265 -0
  57. comproscanner-0.1.0/src/comproscanner.egg-info/SOURCES.txt +59 -0
  58. comproscanner-0.1.0/src/comproscanner.egg-info/dependency_links.txt +1 -0
  59. comproscanner-0.1.0/src/comproscanner.egg-info/requires.txt +28 -0
  60. comproscanner-0.1.0/src/comproscanner.egg-info/top_level.txt +1 -0
  61. comproscanner-0.1.0/tests/test_extract_flow.py +1516 -0
@@ -0,0 +1,21 @@
1
+ The MIT License (MIT)
2
+
3
+ Copyright (c) 2025 SLIMES Lab
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in
13
+ all copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
21
+ THE SOFTWARE.
@@ -0,0 +1,265 @@
1
+ Metadata-Version: 2.4
2
+ Name: comproscanner
3
+ Version: 0.1.0
4
+ Summary: Multi-agent system for extracting and processing structured composition-property data from scientific literature
5
+ Author-email: Aritra Roy <contact@aritraroy.live>
6
+ License: MIT
7
+ Project-URL: Homepage, https://github.com/slimeslab/ComProScanner
8
+ Project-URL: Bug Tracker, https://github.com/slimeslab/ComProScanner/issues
9
+ Classifier: Development Status :: 3 - Alpha
10
+ Classifier: Intended Audience :: Developers
11
+ Classifier: Intended Audience :: Science/Research
12
+ Classifier: License :: OSI Approved :: MIT License
13
+ Classifier: Operating System :: OS Independent
14
+ Classifier: Programming Language :: Python :: 3
15
+ Classifier: Programming Language :: Python :: 3.12
16
+ Classifier: Topic :: Scientific/Engineering
17
+ Classifier: Topic :: Scientific/Engineering :: Chemistry
18
+ Classifier: Topic :: Scientific/Engineering :: Physics
19
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
20
+ Classifier: Topic :: Scientific/Engineering :: Information Analysis
21
+ Classifier: Topic :: Scientific/Engineering :: Visualization
22
+ Requires-Python: <3.14,>=3.12
23
+ Description-Content-Type: text/markdown
24
+ License-File: LICENSE
25
+ Requires-Dist: requests
26
+ Requires-Dist: python-dotenv
27
+ Requires-Dist: tqdm
28
+ Requires-Dist: lxml
29
+ Requires-Dist: pandas
30
+ Requires-Dist: torch
31
+ Requires-Dist: langchain
32
+ Requires-Dist: transformers
33
+ Requires-Dist: tokenizers
34
+ Requires-Dist: mysql
35
+ Requires-Dist: mysql-connector
36
+ Requires-Dist: langchain_community
37
+ Requires-Dist: crewai
38
+ Requires-Dist: crewai-tools
39
+ Requires-Dist: chromadb
40
+ Requires-Dist: docling
41
+ Requires-Dist: sentence-transformers
42
+ Requires-Dist: neo4j
43
+ Requires-Dist: pymatgen
44
+ Requires-Dist: seaborn
45
+ Provides-Extra: dev
46
+ Requires-Dist: pytest; extra == "dev"
47
+ Requires-Dist: pytest-mock; extra == "dev"
48
+ Provides-Extra: docs
49
+ Requires-Dist: mkdocs-material; extra == "docs"
50
+ Requires-Dist: mkdocs-minify-plugin; extra == "docs"
51
+ Dynamic: license-file
52
+
53
+ <p align="center">
54
+ <img src="assets/comproscanner_logo.png" alt="ComProScanner Logo" width="500"/>
55
+ </p>
56
+
57
+ # ComProScanner
58
+
59
+ **A comprehensive Python package for extracting composition-property data from scientific articles for building databases**
60
+
61
+ [![Python Version](https://img.shields.io/badge/python-3.12%20%7C%203.13-blue.svg)](https://www.python.org/downloads/)
62
+ [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
63
+ [![PyPI](https://img.shields.io/pypi/v/comproscanner)](https://pypi.org/project/comproscanner/)
64
+ [![Documentation](https://img.shields.io/badge/docs-latest-brightgreen.svg)](https://slimeslab.github.io/ComProScanner/)
65
+
66
+ ## Overview
67
+
68
+ ComProScanner is a multi-agent framework designed to extract composition-property relationships from scientific articles in materials science. It automates the entire workflow from metadata collection to data extraction, evaluation, and visualization.
69
+
70
+ **Key Features:**
71
+
72
+ - 📚 Multi-publisher support (Elsevier, Springer, Wiley, IOP, local PDFs)
73
+ - 🤖 Agentic extraction using CrewAI framework
74
+ - 🔍 RAG-powered context retrieval for cost-effective, accurate automation
75
+ - 📊 Comprehensive evaluation and visualization tools
76
+ - 🎯 Customizable extraction workflows
77
+ - 🌐 Knowledge graph generation
78
+
79
+ ## Installation
80
+
81
+ Install from PyPI:
82
+
83
+ ```bash
84
+ pip install comproscanner
85
+ ```
86
+
87
+ Or install from source:
88
+
89
+ ```bash
90
+ git clone https://github.com/slimeslab/ComProScanner.git
91
+ cd ComProScanner
92
+ pip install -e .
93
+ ```
94
+
95
+ ## Quick Start
96
+
97
+ Here's a complete example extracting piezoelectric coefficient ($d_{33}$) data:
98
+
99
+ ```python
100
+ from comproscanner import ComProScanner
101
+
102
+ # Initialize scanner
103
+ scanner = ComProScanner(main_property_keyword="piezoelectric")
104
+
105
+ # Collect metadata
106
+ scanner.collect_metadata(
107
+ base_queries=["piezoelectric", "piezoelectricity"],
108
+ extra_queries=["ceramics", "applications"]
109
+ )
110
+
111
+ # Process articles
112
+ property_keywords = {
113
+ "exact_keywords": ["d33"],
114
+ "substring_keywords": [" d 33 "]
115
+ }
116
+
117
+ scanner.process_articles(
118
+ property_keywords=property_keywords,
119
+ source_list=["elsevier", "springer"]
120
+ )
121
+
122
+ # Extract composition-property data
123
+ scanner.extract_composition_property_data(
124
+ main_extraction_keyword="d33"
125
+ )
126
+ ```
127
+
128
+ ## Workflow
129
+
130
+ <div align="center">
131
+ <img src="assets/overall_workflow.png" alt="ComProScanner Workflow" width="750"/>
132
+ </div>
133
+
134
+ The ComProScanner workflow consists of four main stages:
135
+
136
+ 1. **Metadata Retrieval** - Find relevant scientific articles
137
+ 2. **Article Collection** - Extract full-text from various publishers
138
+ 3. **Information Extraction** - Use LLM agents to extract structured data
139
+ 4. **Post Processing & Dataset Creation** - Evaluate, clean, and visualize results
140
+
141
+ ## Documentation
142
+
143
+ 📖 **Full documentation is available at [slimeslab.github.io/ComProScanner](https://slimeslab.github.io/ComProScanner/)**
144
+
145
+ - [Installation Guide](https://slimeslab.github.io/ComProScanner/getting-started/installation/)
146
+ - [Quick Start Tutorial](https://slimeslab.github.io/ComProScanner/getting-started/quick-start/)
147
+ - [User Guide](https://slimeslab.github.io/ComProScanner/usage/metadata-collection/)
148
+ - [RAG Configuration](https://slimeslab.github.io/ComProScanner/rag-config/)
149
+
150
+ ## Core Capabilities
151
+
152
+ ### Supported Publishers
153
+
154
+ - **Elsevier** (via TDM API)
155
+ - **Springer Nature** (via TDM API)
156
+ - **Wiley** (via TDM API)
157
+ - **IOP Publishing** (via SFTP bulk access)
158
+ - **Local PDFs** (any publication)
159
+
160
+ ### Data Extraction
161
+
162
+ - Composition-property relationships
163
+ - Material families
164
+ - Synthesis methods and precursors
165
+ - Characterization techniques
166
+ - Synthesis steps
167
+
168
+ ### Evaluation Methods
169
+
170
+ - **Semantic Evaluation** - Using semantic similarity measures
171
+ - **Agentic Evaluation** - LLM-powered contextual analysis
172
+
173
+ ### Visualization
174
+
175
+ - Data Visualization
176
+ - Evaluation Visualization
177
+
178
+ ## Example Use Cases
179
+
180
+ ### Extract Data from Multiple Sources
181
+
182
+ ```python
183
+ scanner.process_articles(
184
+ property_keywords=property_keywords,
185
+ source_list=["elsevier", "springer", "wiley"]
186
+ )
187
+ ```
188
+
189
+ ### Customize RAG Configuration
190
+
191
+ ```python
192
+ scanner.extract_composition_property_data(
193
+ main_extraction_keyword="d33",
194
+ rag_chat_model="gemini-2.5-pro",
195
+ rag_max_tokens=2048,
196
+ rag_top_k=5
197
+ )
198
+ ```
199
+
200
+ ### Visualize Results
201
+
202
+ ```python
203
+ from comproscanner import data_visualizer, eval_visualizer
204
+
205
+ # Create knowledge graph
206
+ data_visualizer.create_knowledge_graph(result_file="results.json")
207
+
208
+ # Plot evaluation metrics
209
+ eval_visualizer.plot_multiple_radar_charts(
210
+ result_sources=["model1.json", "model2.json"],
211
+ model_names=["GPT-4o", "Claude-3.5"]
212
+ )
213
+ ```
214
+
215
+ ## Requirements
216
+
217
+ - Python 3.12 or 3.13
218
+ - TDM API keys for desired publishers (Elsevier, Springer, Wiley)
219
+ - LLM API keys (OpenAI, Anthropic, Google, etc.)
220
+ - Optional: Neo4j for knowledge graph visualization
221
+
222
+ ## Citation
223
+
224
+ If you use ComProScanner in your research, please cite:
225
+
226
+ ```bibtex
227
+ @misc{roy2025comproscanner,
228
+ title={ComProScanner: A multi-agent based framework for composition-property structured data extraction from scientific literature},
229
+ author={Aritra Roy and Enrico Grisan and John Buckeridge and Chiara Gattinoni},
230
+ year={2025},
231
+ eprint={example},
232
+ archivePrefix={arXiv},
233
+ primaryClass={cond-mat.mtrl-sci},
234
+ url={https://arxiv.org/abs/example},
235
+ }
236
+ ```
237
+
238
+ ## Contributing
239
+
240
+ We welcome contributions! Please see our [Contributing Guidelines](https://slimeslab.github.io/ComProScanner/about/contribution/) for details.
241
+
242
+ ## License
243
+
244
+ This project is licensed under the MIT License - see the [LICENSE](LICENSE) file for details.
245
+
246
+ Copyright (c) 2025 SLIMES Lab
247
+
248
+ ## Contact
249
+
250
+ **Author:** Aritra Roy
251
+
252
+ - 🌐 Website: [aritraroy.live](https://aritraroy.live)
253
+ - 📧 Email: [contact@aritraroy.live](mailto:contact@aritraroy.live)
254
+ - 🐙 GitHub: [@aritraroy24](https://github.com/aritraroy24)
255
+ - 𝕏 Twitter: [@aritraroy24](https://twitter.com/aritraroy24)
256
+
257
+ **Project Links:**
258
+
259
+ - 📦 PyPI: [pypi.org/project/comproscanner](https://pypi.org/project/comproscanner/)
260
+ - 📖 Documentation: [slimeslab.github.io/ComProScanner](https://slimeslab.github.io/ComProScanner/)
261
+ - 🐛 Issues: [github.com/slimeslab/ComProScanner/issues](https://github.com/slimeslab/ComProScanner/issues)
262
+
263
+ ---
264
+
265
+ Made with ❤️ by [SLIMES Lab](https://slimeslab.github.io)
@@ -0,0 +1,213 @@
1
+ <p align="center">
2
+ <img src="assets/comproscanner_logo.png" alt="ComProScanner Logo" width="500"/>
3
+ </p>
4
+
5
+ # ComProScanner
6
+
7
+ **A comprehensive Python package for extracting composition-property data from scientific articles for building databases**
8
+
9
+ [![Python Version](https://img.shields.io/badge/python-3.12%20%7C%203.13-blue.svg)](https://www.python.org/downloads/)
10
+ [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
11
+ [![PyPI](https://img.shields.io/pypi/v/comproscanner)](https://pypi.org/project/comproscanner/)
12
+ [![Documentation](https://img.shields.io/badge/docs-latest-brightgreen.svg)](https://slimeslab.github.io/ComProScanner/)
13
+
14
+ ## Overview
15
+
16
+ ComProScanner is a multi-agent framework designed to extract composition-property relationships from scientific articles in materials science. It automates the entire workflow from metadata collection to data extraction, evaluation, and visualization.
17
+
18
+ **Key Features:**
19
+
20
+ - 📚 Multi-publisher support (Elsevier, Springer, Wiley, IOP, local PDFs)
21
+ - 🤖 Agentic extraction using CrewAI framework
22
+ - 🔍 RAG-powered context retrieval for cost-effective, accurate automation
23
+ - 📊 Comprehensive evaluation and visualization tools
24
+ - 🎯 Customizable extraction workflows
25
+ - 🌐 Knowledge graph generation
26
+
27
+ ## Installation
28
+
29
+ Install from PyPI:
30
+
31
+ ```bash
32
+ pip install comproscanner
33
+ ```
34
+
35
+ Or install from source:
36
+
37
+ ```bash
38
+ git clone https://github.com/slimeslab/ComProScanner.git
39
+ cd ComProScanner
40
+ pip install -e .
41
+ ```
42
+
43
+ ## Quick Start
44
+
45
+ Here's a complete example extracting piezoelectric coefficient ($d_{33}$) data:
46
+
47
+ ```python
48
+ from comproscanner import ComProScanner
49
+
50
+ # Initialize scanner
51
+ scanner = ComProScanner(main_property_keyword="piezoelectric")
52
+
53
+ # Collect metadata
54
+ scanner.collect_metadata(
55
+ base_queries=["piezoelectric", "piezoelectricity"],
56
+ extra_queries=["ceramics", "applications"]
57
+ )
58
+
59
+ # Process articles
60
+ property_keywords = {
61
+ "exact_keywords": ["d33"],
62
+ "substring_keywords": [" d 33 "]
63
+ }
64
+
65
+ scanner.process_articles(
66
+ property_keywords=property_keywords,
67
+ source_list=["elsevier", "springer"]
68
+ )
69
+
70
+ # Extract composition-property data
71
+ scanner.extract_composition_property_data(
72
+ main_extraction_keyword="d33"
73
+ )
74
+ ```
75
+
76
+ ## Workflow
77
+
78
+ <div align="center">
79
+ <img src="assets/overall_workflow.png" alt="ComProScanner Workflow" width="750"/>
80
+ </div>
81
+
82
+ The ComProScanner workflow consists of four main stages:
83
+
84
+ 1. **Metadata Retrieval** - Find relevant scientific articles
85
+ 2. **Article Collection** - Extract full-text from various publishers
86
+ 3. **Information Extraction** - Use LLM agents to extract structured data
87
+ 4. **Post Processing & Dataset Creation** - Evaluate, clean, and visualize results
88
+
89
+ ## Documentation
90
+
91
+ 📖 **Full documentation is available at [slimeslab.github.io/ComProScanner](https://slimeslab.github.io/ComProScanner/)**
92
+
93
+ - [Installation Guide](https://slimeslab.github.io/ComProScanner/getting-started/installation/)
94
+ - [Quick Start Tutorial](https://slimeslab.github.io/ComProScanner/getting-started/quick-start/)
95
+ - [User Guide](https://slimeslab.github.io/ComProScanner/usage/metadata-collection/)
96
+ - [RAG Configuration](https://slimeslab.github.io/ComProScanner/rag-config/)
97
+
98
+ ## Core Capabilities
99
+
100
+ ### Supported Publishers
101
+
102
+ - **Elsevier** (via TDM API)
103
+ - **Springer Nature** (via TDM API)
104
+ - **Wiley** (via TDM API)
105
+ - **IOP Publishing** (via SFTP bulk access)
106
+ - **Local PDFs** (any publication)
107
+
108
+ ### Data Extraction
109
+
110
+ - Composition-property relationships
111
+ - Material families
112
+ - Synthesis methods and precursors
113
+ - Characterization techniques
114
+ - Synthesis steps
115
+
116
+ ### Evaluation Methods
117
+
118
+ - **Semantic Evaluation** - Using semantic similarity measures
119
+ - **Agentic Evaluation** - LLM-powered contextual analysis
120
+
121
+ ### Visualization
122
+
123
+ - Data Visualization
124
+ - Evaluation Visualization
125
+
126
+ ## Example Use Cases
127
+
128
+ ### Extract Data from Multiple Sources
129
+
130
+ ```python
131
+ scanner.process_articles(
132
+ property_keywords=property_keywords,
133
+ source_list=["elsevier", "springer", "wiley"]
134
+ )
135
+ ```
136
+
137
+ ### Customize RAG Configuration
138
+
139
+ ```python
140
+ scanner.extract_composition_property_data(
141
+ main_extraction_keyword="d33",
142
+ rag_chat_model="gemini-2.5-pro",
143
+ rag_max_tokens=2048,
144
+ rag_top_k=5
145
+ )
146
+ ```
147
+
148
+ ### Visualize Results
149
+
150
+ ```python
151
+ from comproscanner import data_visualizer, eval_visualizer
152
+
153
+ # Create knowledge graph
154
+ data_visualizer.create_knowledge_graph(result_file="results.json")
155
+
156
+ # Plot evaluation metrics
157
+ eval_visualizer.plot_multiple_radar_charts(
158
+ result_sources=["model1.json", "model2.json"],
159
+ model_names=["GPT-4o", "Claude-3.5"]
160
+ )
161
+ ```
162
+
163
+ ## Requirements
164
+
165
+ - Python 3.12 or 3.13
166
+ - TDM API keys for desired publishers (Elsevier, Springer, Wiley)
167
+ - LLM API keys (OpenAI, Anthropic, Google, etc.)
168
+ - Optional: Neo4j for knowledge graph visualization
169
+
170
+ ## Citation
171
+
172
+ If you use ComProScanner in your research, please cite:
173
+
174
+ ```bibtex
175
+ @misc{roy2025comproscanner,
176
+ title={ComProScanner: A multi-agent based framework for composition-property structured data extraction from scientific literature},
177
+ author={Aritra Roy and Enrico Grisan and John Buckeridge and Chiara Gattinoni},
178
+ year={2025},
179
+ eprint={example},
180
+ archivePrefix={arXiv},
181
+ primaryClass={cond-mat.mtrl-sci},
182
+ url={https://arxiv.org/abs/example},
183
+ }
184
+ ```
185
+
186
+ ## Contributing
187
+
188
+ We welcome contributions! Please see our [Contributing Guidelines](https://slimeslab.github.io/ComProScanner/about/contribution/) for details.
189
+
190
+ ## License
191
+
192
+ This project is licensed under the MIT License - see the [LICENSE](LICENSE) file for details.
193
+
194
+ Copyright (c) 2025 SLIMES Lab
195
+
196
+ ## Contact
197
+
198
+ **Author:** Aritra Roy
199
+
200
+ - 🌐 Website: [aritraroy.live](https://aritraroy.live)
201
+ - 📧 Email: [contact@aritraroy.live](mailto:contact@aritraroy.live)
202
+ - 🐙 GitHub: [@aritraroy24](https://github.com/aritraroy24)
203
+ - 𝕏 Twitter: [@aritraroy24](https://twitter.com/aritraroy24)
204
+
205
+ **Project Links:**
206
+
207
+ - 📦 PyPI: [pypi.org/project/comproscanner](https://pypi.org/project/comproscanner/)
208
+ - 📖 Documentation: [slimeslab.github.io/ComProScanner](https://slimeslab.github.io/ComProScanner/)
209
+ - 🐛 Issues: [github.com/slimeslab/ComProScanner/issues](https://github.com/slimeslab/ComProScanner/issues)
210
+
211
+ ---
212
+
213
+ Made with ❤️ by [SLIMES Lab](https://slimeslab.github.io)
@@ -0,0 +1,64 @@
1
+ [build-system]
2
+ requires = ["setuptools", "wheel"]
3
+ build-backend = "setuptools.build_meta"
4
+
5
+ [project]
6
+ name = "comproscanner"
7
+ version = "0.1.0"
8
+ description = "Multi-agent system for extracting and processing structured composition-property data from scientific literature"
9
+ readme = "README.md"
10
+ authors = [{ name = "Aritra Roy", email = "contact@aritraroy.live" }]
11
+ license = { text = "MIT" }
12
+ requires-python = ">=3.12,<3.14"
13
+ dependencies = [
14
+ "requests",
15
+ "python-dotenv",
16
+ "tqdm",
17
+ "lxml",
18
+ "pandas",
19
+ "torch",
20
+ "langchain",
21
+ "transformers",
22
+ "tokenizers",
23
+ "mysql",
24
+ "mysql-connector",
25
+ "langchain_community",
26
+ "crewai",
27
+ "crewai-tools",
28
+ "chromadb",
29
+ "docling",
30
+ "sentence-transformers",
31
+ "neo4j",
32
+ "pymatgen",
33
+ "seaborn",
34
+ ]
35
+ classifiers = [
36
+ "Development Status :: 3 - Alpha",
37
+ "Intended Audience :: Developers",
38
+ "Intended Audience :: Science/Research",
39
+ "License :: OSI Approved :: MIT License",
40
+ "Operating System :: OS Independent",
41
+ "Programming Language :: Python :: 3",
42
+ "Programming Language :: Python :: 3.12",
43
+ "Topic :: Scientific/Engineering",
44
+ "Topic :: Scientific/Engineering :: Chemistry",
45
+ "Topic :: Scientific/Engineering :: Physics",
46
+ "Topic :: Scientific/Engineering :: Artificial Intelligence",
47
+ "Topic :: Scientific/Engineering :: Information Analysis",
48
+ "Topic :: Scientific/Engineering :: Visualization",
49
+ ]
50
+ [project.optional-dependencies]
51
+ dev = ["pytest", "pytest-mock"]
52
+ docs = ["mkdocs-material", "mkdocs-minify-plugin"]
53
+ [project.urls]
54
+ "Homepage" = "https://github.com/slimeslab/ComProScanner"
55
+ "Bug Tracker" = "https://github.com/slimeslab/ComProScanner/issues"
56
+ [tool.setuptools]
57
+ package-dir = { "" = "src" }
58
+
59
+ [tool.setuptools.packages.find]
60
+ where = ["src"]
61
+
62
+ [tool.pytest.ini_options]
63
+ testpaths = ["tests"]
64
+ python_files = "test_*.py"
@@ -0,0 +1,4 @@
1
+ [egg_info]
2
+ tag_build =
3
+ tag_date = 0
4
+