comproscanner 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (55)
  1. comproscanner/__init__.py +182 -0
  2. comproscanner/article_processors/__init__.py +0 -0
  3. comproscanner/article_processors/elsevier_processor.py +956 -0
  4. comproscanner/article_processors/iop_processor.py +721 -0
  5. comproscanner/article_processors/pdfs_processor.py +247 -0
  6. comproscanner/article_processors/springer_processor.py +1003 -0
  7. comproscanner/article_processors/wiley_processor.py +543 -0
  8. comproscanner/comproscanner.py +634 -0
  9. comproscanner/data_visualizer.py +498 -0
  10. comproscanner/eval_visualizer.py +1229 -0
  11. comproscanner/extract_flow/__init__.py +0 -0
  12. comproscanner/extract_flow/crews/composition_crew/composition_extraction_crew/composition_extraction_crew.py +140 -0
  13. comproscanner/extract_flow/crews/composition_crew/composition_format_crew/composition_format_crew.py +141 -0
  14. comproscanner/extract_flow/crews/materials_data_identifier_crew/materials_data_identifier_crew.py +145 -0
  15. comproscanner/extract_flow/crews/synthesis_crew/synthesis_extraction_crew/synthesis_extraction_crew.py +139 -0
  16. comproscanner/extract_flow/crews/synthesis_crew/synthesis_format_crew/synthesis_format_crew.py +138 -0
  17. comproscanner/extract_flow/main_extraction_flow.py +619 -0
  18. comproscanner/extract_flow/tools/__init__.py +0 -0
  19. comproscanner/extract_flow/tools/material_parser_tool.py +141 -0
  20. comproscanner/extract_flow/tools/rag_tool.py +214 -0
  21. comproscanner/metadata_extractor/__init__.py +0 -0
  22. comproscanner/metadata_extractor/fetch_metadata.py +546 -0
  23. comproscanner/metadata_extractor/filter_metadata.py +603 -0
  24. comproscanner/post_processing/data_cleaner.py +320 -0
  25. comproscanner/post_processing/evaluation/eval_flow/crews/composition_evaluation_crew/composition_evaluation_crew.py +111 -0
  26. comproscanner/post_processing/evaluation/eval_flow/crews/synthesis_evaluation_crew/synthesis_evaluation_crew.py +100 -0
  27. comproscanner/post_processing/evaluation/eval_flow/eval_flow.py +1685 -0
  28. comproscanner/post_processing/evaluation/semantic_evaluator.py +1541 -0
  29. comproscanner/post_processing/visualization/create_knowledge_graph.py +1003 -0
  30. comproscanner/post_processing/visualization/data_distribution_visualizers.py +1387 -0
  31. comproscanner/post_processing/visualization/eval_plot_visualizers.py +3681 -0
  32. comproscanner/utils/__init__.py +0 -0
  33. comproscanner/utils/common_functions.py +84 -0
  34. comproscanner/utils/configs/__init__.py +17 -0
  35. comproscanner/utils/configs/article_keywords.py +281 -0
  36. comproscanner/utils/configs/base_urls.py +21 -0
  37. comproscanner/utils/configs/custom_dictionary.py +77 -0
  38. comproscanner/utils/configs/database_config.py +34 -0
  39. comproscanner/utils/configs/llm_config.py +62 -0
  40. comproscanner/utils/configs/paths_config.py +20 -0
  41. comproscanner/utils/configs/rag_config.py +44 -0
  42. comproscanner/utils/data_preparator.py +419 -0
  43. comproscanner/utils/database_manager.py +248 -0
  44. comproscanner/utils/embeddings.py +200 -0
  45. comproscanner/utils/error_handler.py +92 -0
  46. comproscanner/utils/get_paper_data.py +353 -0
  47. comproscanner/utils/logger.py +117 -0
  48. comproscanner/utils/pdf_to_markdown_text.py +439 -0
  49. comproscanner/utils/prepare_iop_files.py +323 -0
  50. comproscanner/utils/save_results.py +112 -0
  51. comproscanner-0.1.0.dist-info/METADATA +265 -0
  52. comproscanner-0.1.0.dist-info/RECORD +55 -0
  53. comproscanner-0.1.0.dist-info/WHEEL +5 -0
  54. comproscanner-0.1.0.dist-info/licenses/LICENSE +21 -0
  55. comproscanner-0.1.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,182 @@
1
+ """
2
+ ComProScanner - A package for extracting composition-property data from scientific articles.
3
+
4
+ This package provides tools to collect metadata, process articles from various sources,
5
+ extract composition-property relationships, evaluate extraction performance and visualize data distribution.
6
+
7
+ Main functions:
8
+ - collect_metadata: Collect and filter metadata from scientific articles
9
+ - process_articles: Process articles from various sources (Elsevier, Wiley, etc.)
10
+ - extract_composition_property_data: Extract composition-property relationships from articles
11
+ - evaluate_semantic: Evaluate extraction quality using semantic similarity
12
+ - evaluate_agentic: Evaluate extraction quality using agent-based methods
13
+ - create_knowledge_graph: Create knowledge graph from extracted data
14
+ """
15
+
16
+ # Import the main class
17
+ from .comproscanner import ComProScanner
18
+
19
+ # Import core configuration classes
20
+ from .utils.configs.rag_config import RAGConfig
21
+ from .utils.configs.llm_config import LLMConfig
22
+
23
+ # Import visualization module
24
+ from . import eval_visualizer
25
+ from . import data_visualizer
26
+
27
+ # Package version
28
+ __version__ = "0.1.0"
29
+
30
+ # Importing options for "from comproscanner import *"
31
+ __all__ = [
32
+ "ComProScanner",
33
+ "collect_metadata",
34
+ "process_articles",
35
+ "extract_composition_property_data",
36
+ "evaluate_semantic",
37
+ "evaluate_agentic",
38
+ "create_knowledge_graph",
39
+ "RAGConfig",
40
+ "LLMConfig",
41
+ "eval_visualizer",
42
+ "data_visualizer",
43
+ ]
44
+
45
+
46
def collect_metadata(
    main_property_keyword,
    base_queries=None,
    extra_queries=None,
    start_year=None,
    end_year=None,
):
    """
    Collect and filter metadata from scientific articles.

    Convenience wrapper that builds a ComProScanner for the given property
    keyword and delegates to its ``collect_metadata`` method.

    Args:
        main_property_keyword (str): The main property keyword to search for
        base_queries (list, optional): List of base queries to search for
        extra_queries (list, optional): List of extra queries to search for
        start_year (int, optional): Start year for the search
        end_year (int, optional): End year for the search
    """
    # Gather the search options once so the delegation is a single call.
    search_options = {
        "base_queries": base_queries,
        "extra_queries": extra_queries,
        "start_year": start_year,
        "end_year": end_year,
    }
    cps = ComProScanner(main_property_keyword=main_property_keyword)
    return cps.collect_metadata(**search_options)
70
+
71
+
72
def process_articles(
    main_property_keyword,
    property_keywords=None,
    source_list=None,
    **kwargs,
):
    """
    Process articles for the main property keyword.

    Args:
        main_property_keyword (str): The main property keyword to search for
        property_keywords (dict, optional): Dictionary of property keywords for filtering
        source_list (list, optional): List of sources to process. Defaults to
            ["elsevier", "wiley", "iop", "springer", "pdfs"] when not given.
        **kwargs: Additional arguments to pass to the process_articles method
    """
    # Resolve the default here rather than in the signature: a mutable list
    # default would be shared across every call to this function.
    if source_list is None:
        source_list = ["elsevier", "wiley", "iop", "springer", "pdfs"]
    scanner = ComProScanner(main_property_keyword=main_property_keyword)
    return scanner.process_articles(
        property_keywords=property_keywords, source_list=source_list, **kwargs
    )
91
+
92
+
93
def extract_composition_property_data(
    main_property_keyword, main_extraction_keyword=None, **kwargs
):
    """
    Extract composition-property data from articles.

    Args:
        main_property_keyword (str): The main property keyword
        main_extraction_keyword (str): The main keyword to extract data for
        **kwargs: Additional arguments to pass to the
            extract_composition_property_data method
    """
    # Delegate to a scanner instance configured for this property keyword.
    cps = ComProScanner(main_property_keyword=main_property_keyword)
    result = cps.extract_composition_property_data(
        main_extraction_keyword=main_extraction_keyword, **kwargs
    )
    return result
108
+
109
+
110
def evaluate_semantic(
    ground_truth_file=None,
    test_data_file=None,
    weights=None,
    output_file="semantic_evaluation_result.json",
    extraction_agent_model_name="gpt-4o-mini",
    is_synthesis_evaluation=True,
    use_semantic_model=True,
    primary_model_name="thellert/physbert_cased",
    fallback_model_name="all-mpnet-base-v2",
    similarity_thresholds=None,
):
    """
    Evaluate the extracted data using semantic evaluation.

    Args:
        ground_truth_file (str, optional): Path to the ground truth file. Defaults to None.
        test_data_file (str, optional): Path to the test data file. Defaults to None.
        weights (dict, optional): Weights for the evaluation metrics. Defaults to None.
        output_file (str, optional): Path to the output file for saving the evaluation results. Defaults to "semantic_evaluation_result.json".
        extraction_agent_model_name (str, optional): Name of the agent model used for extraction. Defaults to "gpt-4o-mini".
        is_synthesis_evaluation (bool, optional): A flag to indicate if synthesis evaluation is required. Defaults to True.
        use_semantic_model (bool, optional): A flag to indicate if semantic model should be used for evaluation. Defaults to True.
        primary_model_name (str, optional): Name of the primary model for semantic evaluation. Defaults to "thellert/physbert_cased".
        fallback_model_name (str, optional): Name of the fallback model for semantic evaluation. Defaults to "all-mpnet-base-v2".
        similarity_thresholds (dict, optional): Similarity thresholds for evaluation. Defaults to None
            (the underlying evaluator presumably supplies its own per-metric default — verify in ComProScanner.evaluate_semantic).
    """
    # Evaluation does not depend on a property keyword, but the ComProScanner
    # constructor requires one, so a placeholder value is used.
    scanner = ComProScanner(main_property_keyword="placeholder")
    return scanner.evaluate_semantic(
        ground_truth_file=ground_truth_file,
        test_data_file=test_data_file,
        weights=weights,
        output_file=output_file,
        # Note: the wrapper's keyword is `extraction_agent_model_name`, but the
        # underlying method expects `agent_model_name`.
        agent_model_name=extraction_agent_model_name,
        is_synthesis_evaluation=is_synthesis_evaluation,
        use_semantic_model=use_semantic_model,
        primary_model_name=primary_model_name,
        fallback_model_name=fallback_model_name,
        similarity_thresholds=similarity_thresholds,
    )
150
+
151
+
152
def evaluate_agentic(
    ground_truth_file=None,
    test_data_file=None,
    output_file="detailed_evaluation.json",
    extraction_agent_model_name="gpt-4o-mini",
    is_synthesis_evaluation=True,
    weights=None,
    llm=None,
):
    """
    Evaluate the extracted data using agentic evaluation.

    Args:
        ground_truth_file (str, optional): Path to the ground truth file. Defaults to None.
        test_data_file (str, optional): Path to the test data file. Defaults to None.
        output_file (str, optional): Path to the output file for saving the evaluation results. Defaults to "detailed_evaluation.json".
        extraction_agent_model_name (str, optional): Name of the agent model used for extraction. Defaults to "gpt-4o-mini".
        is_synthesis_evaluation (bool, optional): A flag to indicate if synthesis evaluation is required. Defaults to True.
        weights (dict, optional): Weights for the evaluation metrics. Defaults to None.
        llm (LLM, optional): An instance of the LLM class. Defaults to None.
    """
    # Evaluation does not depend on a property keyword, but the ComProScanner
    # constructor requires one, so a placeholder value is used.
    scanner = ComProScanner(main_property_keyword="placeholder")
    return scanner.evaluate_agentic(
        ground_truth_file=ground_truth_file,
        test_data_file=test_data_file,
        output_file=output_file,
        # Note: the wrapper's keyword is `extraction_agent_model_name`, but the
        # underlying method expects `agent_model_name`.
        agent_model_name=extraction_agent_model_name,
        is_synthesis_evaluation=is_synthesis_evaluation,
        weights=weights,
        llm=llm,
    )
File without changes