comproscanner 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- comproscanner/__init__.py +182 -0
- comproscanner/article_processors/__init__.py +0 -0
- comproscanner/article_processors/elsevier_processor.py +956 -0
- comproscanner/article_processors/iop_processor.py +721 -0
- comproscanner/article_processors/pdfs_processor.py +247 -0
- comproscanner/article_processors/springer_processor.py +1003 -0
- comproscanner/article_processors/wiley_processor.py +543 -0
- comproscanner/comproscanner.py +634 -0
- comproscanner/data_visualizer.py +498 -0
- comproscanner/eval_visualizer.py +1229 -0
- comproscanner/extract_flow/__init__.py +0 -0
- comproscanner/extract_flow/crews/composition_crew/composition_extraction_crew/composition_extraction_crew.py +140 -0
- comproscanner/extract_flow/crews/composition_crew/composition_format_crew/composition_format_crew.py +141 -0
- comproscanner/extract_flow/crews/materials_data_identifier_crew/materials_data_identifier_crew.py +145 -0
- comproscanner/extract_flow/crews/synthesis_crew/synthesis_extraction_crew/synthesis_extraction_crew.py +139 -0
- comproscanner/extract_flow/crews/synthesis_crew/synthesis_format_crew/synthesis_format_crew.py +138 -0
- comproscanner/extract_flow/main_extraction_flow.py +619 -0
- comproscanner/extract_flow/tools/__init__.py +0 -0
- comproscanner/extract_flow/tools/material_parser_tool.py +141 -0
- comproscanner/extract_flow/tools/rag_tool.py +214 -0
- comproscanner/metadata_extractor/__init__.py +0 -0
- comproscanner/metadata_extractor/fetch_metadata.py +546 -0
- comproscanner/metadata_extractor/filter_metadata.py +603 -0
- comproscanner/post_processing/data_cleaner.py +320 -0
- comproscanner/post_processing/evaluation/eval_flow/crews/composition_evaluation_crew/composition_evaluation_crew.py +111 -0
- comproscanner/post_processing/evaluation/eval_flow/crews/synthesis_evaluation_crew/synthesis_evaluation_crew.py +100 -0
- comproscanner/post_processing/evaluation/eval_flow/eval_flow.py +1685 -0
- comproscanner/post_processing/evaluation/semantic_evaluator.py +1541 -0
- comproscanner/post_processing/visualization/create_knowledge_graph.py +1003 -0
- comproscanner/post_processing/visualization/data_distribution_visualizers.py +1387 -0
- comproscanner/post_processing/visualization/eval_plot_visualizers.py +3681 -0
- comproscanner/utils/__init__.py +0 -0
- comproscanner/utils/common_functions.py +84 -0
- comproscanner/utils/configs/__init__.py +17 -0
- comproscanner/utils/configs/article_keywords.py +281 -0
- comproscanner/utils/configs/base_urls.py +21 -0
- comproscanner/utils/configs/custom_dictionary.py +77 -0
- comproscanner/utils/configs/database_config.py +34 -0
- comproscanner/utils/configs/llm_config.py +62 -0
- comproscanner/utils/configs/paths_config.py +20 -0
- comproscanner/utils/configs/rag_config.py +44 -0
- comproscanner/utils/data_preparator.py +419 -0
- comproscanner/utils/database_manager.py +248 -0
- comproscanner/utils/embeddings.py +200 -0
- comproscanner/utils/error_handler.py +92 -0
- comproscanner/utils/get_paper_data.py +353 -0
- comproscanner/utils/logger.py +117 -0
- comproscanner/utils/pdf_to_markdown_text.py +439 -0
- comproscanner/utils/prepare_iop_files.py +323 -0
- comproscanner/utils/save_results.py +112 -0
- comproscanner-0.1.0.dist-info/METADATA +265 -0
- comproscanner-0.1.0.dist-info/RECORD +55 -0
- comproscanner-0.1.0.dist-info/WHEEL +5 -0
- comproscanner-0.1.0.dist-info/licenses/LICENSE +21 -0
- comproscanner-0.1.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,182 @@
|
|
|
1
|
+
"""
|
|
2
|
+
ComProScanner - A package for extracting composition-property data from scientific articles.
|
|
3
|
+
|
|
4
|
+
This package provides tools to collect metadata, process articles from various sources,
|
|
5
|
+
extract composition-property relationships, evaluate extraction performance and visualize data distribution.
|
|
6
|
+
|
|
7
|
+
Main functions:
|
|
8
|
+
- collect_metadata: Collect and filter metadata from scientific articles
|
|
9
|
+
- process_articles: Process articles from various sources (Elsevier, Wiley, etc.)
|
|
10
|
+
- extract_composition_property_data: Extract composition-property relationships from articles
|
|
11
|
+
- evaluate_semantic: Evaluate extraction quality using semantic similarity
|
|
12
|
+
- evaluate_agentic: Evaluate extraction quality using agent-based methods
|
|
13
|
+
- create_knowledge_graph: Create knowledge graph from extracted data
|
|
14
|
+
"""
|
|
15
|
+
|
|
16
|
+
# Import the main class
|
|
17
|
+
from .comproscanner import ComProScanner
|
|
18
|
+
|
|
19
|
+
# Import core configuration classes
|
|
20
|
+
from .utils.configs.rag_config import RAGConfig
|
|
21
|
+
from .utils.configs.llm_config import LLMConfig
|
|
22
|
+
|
|
23
|
+
# Import visualization module
|
|
24
|
+
from . import eval_visualizer
|
|
25
|
+
from . import data_visualizer
|
|
26
|
+
|
|
27
|
+
# Package version
|
|
28
|
+
__version__ = "0.1.0"
|
|
29
|
+
|
|
30
|
+
# Importing options for "from comproscanner import *"
|
|
31
|
+
# Names exported by "from comproscanner import *". Every entry has to be bound
# in this module; a star-import raises AttributeError for any listed name that
# is missing. "create_knowledge_graph" is intentionally not listed because no
# object with that name is defined or imported in this module.
__all__ = [
    "ComProScanner",
    "collect_metadata",
    "process_articles",
    "extract_composition_property_data",
    "evaluate_semantic",
    "evaluate_agentic",
    "RAGConfig",
    "LLMConfig",
    "eval_visualizer",
    "data_visualizer",
]
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
def collect_metadata(
    main_property_keyword,
    base_queries=None,
    extra_queries=None,
    start_year=None,
    end_year=None,
):
    """
    Module-level shortcut for ``ComProScanner.collect_metadata``.

    A ``ComProScanner`` is created for ``main_property_keyword`` and the
    remaining arguments are forwarded, by keyword, to its
    ``collect_metadata`` method.

    Args:
        main_property_keyword (str): The main property keyword to search for
        base_queries (list, optional): List of base queries to search for
        extra_queries (list, optional): List of extra queries to search for
        start_year (int, optional): Start year for the search
        end_year (int, optional): End year for the search

    Returns:
        Whatever ``ComProScanner.collect_metadata`` returns.
    """
    # Collected in the same order the scanner method is called with.
    search_options = {
        "base_queries": base_queries,
        "extra_queries": extra_queries,
        "start_year": start_year,
        "end_year": end_year,
    }
    return ComProScanner(
        main_property_keyword=main_property_keyword
    ).collect_metadata(**search_options)
|
|
70
|
+
|
|
71
|
+
|
|
72
|
+
def process_articles(
    main_property_keyword,
    property_keywords=None,
    source_list=None,
    **kwargs,
):
    """
    Process articles for the main property keyword.

    Creates a ``ComProScanner`` for ``main_property_keyword`` and delegates to
    its ``process_articles`` method.

    Args:
        main_property_keyword (str): The main property keyword to search for
        property_keywords (dict, optional): Dictionary of property keywords
            for filtering. Defaults to None.
        source_list (list, optional): List of sources to process. When None,
            defaults to ["elsevier", "wiley", "iop", "springer", "pdfs"].
        **kwargs: Additional arguments to pass to the process_articles method

    Returns:
        Whatever ``ComProScanner.process_articles`` returns.
    """
    # Build the default inside the call: a list literal in the signature is
    # created once and shared by every call, so any in-place change made by
    # downstream code would leak into later calls.
    if source_list is None:
        source_list = ["elsevier", "wiley", "iop", "springer", "pdfs"]

    scanner = ComProScanner(main_property_keyword=main_property_keyword)
    return scanner.process_articles(
        property_keywords=property_keywords, source_list=source_list, **kwargs
    )
|
|
91
|
+
|
|
92
|
+
|
|
93
|
+
def extract_composition_property_data(
    main_property_keyword,
    main_extraction_keyword=None,
    **kwargs,
):
    """
    Module-level shortcut for ``ComProScanner.extract_composition_property_data``.

    Args:
        main_property_keyword (str): The main property keyword, used to
            construct the ``ComProScanner`` instance
        main_extraction_keyword (str): The main keyword to extract data for
        **kwargs: Additional arguments to pass to the
            extract_composition_property_data method

    Returns:
        Whatever ``ComProScanner.extract_composition_property_data`` returns.
    """
    # The explicit keyword goes first, followed by the caller's extras.
    forwarded = {"main_extraction_keyword": main_extraction_keyword, **kwargs}
    extractor = ComProScanner(main_property_keyword=main_property_keyword)
    return extractor.extract_composition_property_data(**forwarded)
|
|
108
|
+
|
|
109
|
+
|
|
110
|
+
def evaluate_semantic(
    ground_truth_file=None,
    test_data_file=None,
    weights=None,
    output_file="semantic_evaluation_result.json",
    extraction_agent_model_name="gpt-4o-mini",
    is_synthesis_evaluation=True,
    use_semantic_model=True,
    primary_model_name="thellert/physbert_cased",
    fallback_model_name="all-mpnet-base-v2",
    similarity_thresholds=None,
):
    """
    Run semantic evaluation of extracted data via ``ComProScanner.evaluate_semantic``.

    Args:
        ground_truth_file (str, optional): Path to the ground truth file. Defaults to None.
        test_data_file (str, optional): Path to the test data file. Defaults to None.
        weights (dict, optional): Weights for the evaluation metrics. Defaults to None.
        output_file (str, optional): Path to the output file for saving the evaluation
            results. Defaults to "semantic_evaluation_result.json".
        extraction_agent_model_name (str, optional): Name of the agent model used for
            extraction; forwarded to the scanner as ``agent_model_name``.
            Defaults to "gpt-4o-mini".
        is_synthesis_evaluation (bool, optional): A flag to indicate if synthesis
            evaluation is required. Defaults to True.
        use_semantic_model (bool, optional): A flag to indicate if semantic model should
            be used for evaluation. Defaults to True.
        primary_model_name (str, optional): Name of the primary model for semantic
            evaluation. Defaults to "thellert/physbert_cased".
        fallback_model_name (str, optional): Name of the fallback model for semantic
            evaluation. Defaults to "all-mpnet-base-v2".
        similarity_thresholds (dict, optional): Similarity thresholds for evaluation.
            Defaults to None here; presumably 0.8 per metric is applied downstream
            when None is given -- confirm against ComProScanner.evaluate_semantic.

    Returns:
        Whatever ``ComProScanner.evaluate_semantic`` returns.
    """
    # The scanner requires a property keyword; this wrapper takes none, so a
    # fixed dummy value is supplied.
    evaluator = ComProScanner(main_property_keyword="placeholder")

    evaluation_options = {
        "ground_truth_file": ground_truth_file,
        "test_data_file": test_data_file,
        "weights": weights,
        "output_file": output_file,
        # The scanner method calls this parameter ``agent_model_name``.
        "agent_model_name": extraction_agent_model_name,
        "is_synthesis_evaluation": is_synthesis_evaluation,
        "use_semantic_model": use_semantic_model,
        "primary_model_name": primary_model_name,
        "fallback_model_name": fallback_model_name,
        "similarity_thresholds": similarity_thresholds,
    }
    return evaluator.evaluate_semantic(**evaluation_options)
|
|
150
|
+
|
|
151
|
+
|
|
152
|
+
def evaluate_agentic(
    ground_truth_file=None,
    test_data_file=None,
    output_file="detailed_evaluation.json",
    extraction_agent_model_name="gpt-4o-mini",
    is_synthesis_evaluation=True,
    weights=None,
    llm=None,
):
    """
    Run agentic evaluation of extracted data via ``ComProScanner.evaluate_agentic``.

    Args:
        ground_truth_file (str, optional): Path to the ground truth file. Defaults to None.
        test_data_file (str, optional): Path to the test data file. Defaults to None.
        output_file (str, optional): Path to the output file for saving the evaluation
            results. Defaults to "detailed_evaluation.json".
        extraction_agent_model_name (str, optional): Name of the agent model used for
            extraction; forwarded to the scanner as ``agent_model_name``.
            Defaults to "gpt-4o-mini".
        is_synthesis_evaluation (bool, optional): A flag to indicate if synthesis
            evaluation is required. Defaults to True.
        weights (dict, optional): Weights for the evaluation metrics. Defaults to None.
        llm (LLM, optional): An instance of the LLM class. Defaults to None.

    Returns:
        Whatever ``ComProScanner.evaluate_agentic`` returns.
    """
    # The scanner requires a property keyword; this wrapper takes none, so a
    # fixed dummy value is supplied.
    evaluator = ComProScanner(main_property_keyword="placeholder")

    evaluation_options = {
        "ground_truth_file": ground_truth_file,
        "test_data_file": test_data_file,
        "output_file": output_file,
        # The scanner method calls this parameter ``agent_model_name``.
        "agent_model_name": extraction_agent_model_name,
        "is_synthesis_evaluation": is_synthesis_evaluation,
        "weights": weights,
        "llm": llm,
    }
    return evaluator.evaluate_agentic(**evaluation_options)
|
|
File without changes
|