bibexpy 2.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- bibex_core/MergeDB.py +1189 -0
- bibex_core/__init__.py +13 -0
- bibex_core/modules/__init__.py +35 -0
- bibex_core/modules/api_utils.py +1172 -0
- bibex_core/modules/c1_utils.py +423 -0
- bibex_core/modules/file_utils.py +71 -0
- bibex_core/modules/ml_utils.py +332 -0
- bibex_core/modules/post_process.py +50 -0
- bibex_core/modules/stats_utils.py +216 -0
- bibex_core/scp2xlsx.py +235 -0
- bibex_core/wos2xlsx.py +159 -0
- bibex_core/xlsx2vos.py +185 -0
- bibexpy/__init__.py +9 -0
- bibexpy/__main__.py +6 -0
- bibexpy/_server/__init__.py +1 -0
- bibexpy/_server/config.py +92 -0
- bibexpy/_server/jobs/__init__.py +3 -0
- bibexpy/_server/jobs/runner.py +301 -0
- bibexpy/_server/main.py +211 -0
- bibexpy/_server/models/__init__.py +0 -0
- bibexpy/_server/models/project.py +25 -0
- bibexpy/_server/routers/__init__.py +0 -0
- bibexpy/_server/routers/audit.py +65 -0
- bibexpy/_server/routers/convert.py +101 -0
- bibexpy/_server/routers/disambiguate.py +177 -0
- bibexpy/_server/routers/downloads.py +26 -0
- bibexpy/_server/routers/enrich.py +42 -0
- bibexpy/_server/routers/export.py +116 -0
- bibexpy/_server/routers/export_folder.py +97 -0
- bibexpy/_server/routers/filter.py +131 -0
- bibexpy/_server/routers/jobs.py +72 -0
- bibexpy/_server/routers/merge.py +437 -0
- bibexpy/_server/routers/prepare.py +241 -0
- bibexpy/_server/routers/projects.py +49 -0
- bibexpy/_server/routers/quality.py +288 -0
- bibexpy/_server/routers/records.py +204 -0
- bibexpy/_server/routers/report.py +111 -0
- bibexpy/_server/routers/settings.py +383 -0
- bibexpy/_server/routers/system.py +209 -0
- bibexpy/_server/routers/tools.py +257 -0
- bibexpy/_server/routers/upload.py +110 -0
- bibexpy/_server/services/__init__.py +0 -0
- bibexpy/_server/services/analyses.py +364 -0
- bibexpy/_server/services/audit.py +340 -0
- bibexpy/_server/services/bibex_adapter.py +47 -0
- bibexpy/_server/services/bibtex_writer.py +104 -0
- bibexpy/_server/services/converter.py +205 -0
- bibexpy/_server/services/disambiguation/__init__.py +19 -0
- bibexpy/_server/services/disambiguation/blocking.py +581 -0
- bibexpy/_server/services/disambiguation/cache.py +47 -0
- bibexpy/_server/services/disambiguation/deepseek_client.py +203 -0
- bibexpy/_server/services/disambiguation/orcid.py +261 -0
- bibexpy/_server/services/disambiguation/pipeline.py +985 -0
- bibexpy/_server/services/disambiguation/similarity.py +91 -0
- bibexpy/_server/services/enricher.py +338 -0
- bibexpy/_server/services/exporter.py +112 -0
- bibexpy/_server/services/filter_engine.py +300 -0
- bibexpy/_server/services/merger.py +202 -0
- bibexpy/_server/services/methodology.py +102 -0
- bibexpy/_server/services/report_export.py +300 -0
- bibexpy/_server/services/ris_writer.py +62 -0
- bibexpy/_server/services/smart_merger.py +967 -0
- bibexpy/_server/services/storage.py +165 -0
- bibexpy/_web/404/index.html +1 -0
- bibexpy/_web/404.html +1 -0
- bibexpy/_web/_next/static/chunks/117-7ad5186126fa076d.js +2 -0
- bibexpy/_web/_next/static/chunks/156-75beffb8ae92a7f3.js +1 -0
- bibexpy/_web/_next/static/chunks/169-301c04fcec22686f.js +4 -0
- bibexpy/_web/_next/static/chunks/2-ecc1c36a5632ac61.js +1 -0
- bibexpy/_web/_next/static/chunks/244-be343e0f8aa00fb5.js +1 -0
- bibexpy/_web/_next/static/chunks/367-b7457c2b95f6932b.js +1 -0
- bibexpy/_web/_next/static/chunks/382-9cd163eed632e241.js +1 -0
- bibexpy/_web/_next/static/chunks/392-afa8bfa4c1a9893d.js +1 -0
- bibexpy/_web/_next/static/chunks/507-63607ded9f83c1aa.js +1 -0
- bibexpy/_web/_next/static/chunks/624-cae565632f47217d.js +1 -0
- bibexpy/_web/_next/static/chunks/739-a00c6b9c0f11813e.js +1 -0
- bibexpy/_web/_next/static/chunks/862-8e1bd5f93c082c02.js +1 -0
- bibexpy/_web/_next/static/chunks/app/_not-found/page-873ba59b5eb7c342.js +1 -0
- bibexpy/_web/_next/static/chunks/app/layout-0d5fc68ac8971a04.js +1 -0
- bibexpy/_web/_next/static/chunks/app/page-cf2bea5edc74459a.js +1 -0
- bibexpy/_web/_next/static/chunks/app/projects/[id]/convert/page-ab6dd645ae113b64.js +1 -0
- bibexpy/_web/_next/static/chunks/app/projects/[id]/disambiguate/page-dd98482cc47344e9.js +1 -0
- bibexpy/_web/_next/static/chunks/app/projects/[id]/enrich/page-4727e431468514e9.js +1 -0
- bibexpy/_web/_next/static/chunks/app/projects/[id]/export/page-f9005d164bb22cb0.js +1 -0
- bibexpy/_web/_next/static/chunks/app/projects/[id]/layout-07755fda55b3ae1f.js +1 -0
- bibexpy/_web/_next/static/chunks/app/projects/[id]/merge/page-45b95318ea7beccf.js +1 -0
- bibexpy/_web/_next/static/chunks/app/projects/[id]/records/page-bfad80457da9eb0b.js +1 -0
- bibexpy/_web/_next/static/chunks/app/projects/[id]/report/page-f05767e74158e995.js +1 -0
- bibexpy/_web/_next/static/chunks/app/projects/[id]/upload/page-8df78ca9cdecbcb8.js +1 -0
- bibexpy/_web/_next/static/chunks/app/projects/page-99960711ac6a13e9.js +1 -0
- bibexpy/_web/_next/static/chunks/app/settings/page-002df809d6c5a227.js +1 -0
- bibexpy/_web/_next/static/chunks/app/tools/page-27eee1c6a590859f.js +1 -0
- bibexpy/_web/_next/static/chunks/fd9d1056-f2718be92181f426.js +1 -0
- bibexpy/_web/_next/static/chunks/framework-f66176bb897dc684.js +1 -0
- bibexpy/_web/_next/static/chunks/main-app-d138a6e2f8c847f6.js +1 -0
- bibexpy/_web/_next/static/chunks/main-c4b9ddbf514503ef.js +1 -0
- bibexpy/_web/_next/static/chunks/pages/_app-72b849fbd24ac258.js +1 -0
- bibexpy/_web/_next/static/chunks/pages/_error-7ba65e1336b92748.js +1 -0
- bibexpy/_web/_next/static/chunks/polyfills-42372ed130431b0a.js +1 -0
- bibexpy/_web/_next/static/chunks/webpack-c81f7fd28659d64f.js +1 -0
- bibexpy/_web/_next/static/css/1c4638f3e39cf127.css +3 -0
- bibexpy/_web/_next/static/ypx2IQAkZiSaFiKWnzZGB/_buildManifest.js +1 -0
- bibexpy/_web/_next/static/ypx2IQAkZiSaFiKWnzZGB/_ssgManifest.js +1 -0
- bibexpy/_web/images/PROMPTS.md +83 -0
- bibexpy/_web/images/about-illustration.png +0 -0
- bibexpy/_web/images/authors/alperen-sahin.jpg +0 -0
- bibexpy/_web/images/authors/burak-can-kara.jpg +0 -0
- bibexpy/_web/images/authors/taskin-dirsehan.jpg +0 -0
- bibexpy/_web/images/bibexpy-logo-full.png +0 -0
- bibexpy/_web/images/bibexpy-logo-header.png +0 -0
- bibexpy/_web/images/bibexpy-logo-in.png +0 -0
- bibexpy/_web/images/bibexpy-logo.webp +0 -0
- bibexpy/_web/images/hero-illustration.png +0 -0
- bibexpy/_web/images/loader.gif +0 -0
- bibexpy/_web/images/workflow-diagram.png +0 -0
- bibexpy/_web/index.html +1 -0
- bibexpy/_web/index.txt +11 -0
- bibexpy/_web/projects/_/convert/index.html +1 -0
- bibexpy/_web/projects/_/convert/index.txt +12 -0
- bibexpy/_web/projects/_/disambiguate/index.html +1 -0
- bibexpy/_web/projects/_/disambiguate/index.txt +12 -0
- bibexpy/_web/projects/_/enrich/index.html +1 -0
- bibexpy/_web/projects/_/enrich/index.txt +12 -0
- bibexpy/_web/projects/_/export/index.html +1 -0
- bibexpy/_web/projects/_/export/index.txt +12 -0
- bibexpy/_web/projects/_/merge/index.html +1 -0
- bibexpy/_web/projects/_/merge/index.txt +12 -0
- bibexpy/_web/projects/_/records/index.html +1 -0
- bibexpy/_web/projects/_/records/index.txt +12 -0
- bibexpy/_web/projects/_/report/index.html +1 -0
- bibexpy/_web/projects/_/report/index.txt +12 -0
- bibexpy/_web/projects/_/upload/index.html +1 -0
- bibexpy/_web/projects/_/upload/index.txt +12 -0
- bibexpy/_web/projects/index.html +1 -0
- bibexpy/_web/projects/index.txt +11 -0
- bibexpy/_web/settings/index.html +1 -0
- bibexpy/_web/settings/index.txt +11 -0
- bibexpy/_web/tools/README.md +65 -0
- bibexpy/_web/tools/bibexcel.svg +6 -0
- bibexpy/_web/tools/bibliometrix.png +0 -0
- bibexpy/_web/tools/biblioshiny.svg +12 -0
- bibexpy/_web/tools/citavi.svg +6 -0
- bibexpy/_web/tools/citespace.svg +7 -0
- bibexpy/_web/tools/citnetexplorer.svg +8 -0
- bibexpy/_web/tools/endnote.svg +6 -0
- bibexpy/_web/tools/excel.svg +1 -0
- bibexpy/_web/tools/gephi.svg +66 -0
- bibexpy/_web/tools/histcite.svg +7 -0
- bibexpy/_web/tools/index.html +1 -0
- bibexpy/_web/tools/index.txt +11 -0
- bibexpy/_web/tools/jabref.svg +122 -0
- bibexpy/_web/tools/latex.svg +1 -0
- bibexpy/_web/tools/mendeley.svg +1 -0
- bibexpy/_web/tools/openrefine.svg +15 -0
- bibexpy/_web/tools/overleaf.svg +1 -0
- bibexpy/_web/tools/papers.svg +7 -0
- bibexpy/_web/tools/powerbi.svg +1 -0
- bibexpy/_web/tools/python.svg +1 -0
- bibexpy/_web/tools/r.svg +1 -0
- bibexpy/_web/tools/refworks.svg +5 -0
- bibexpy/_web/tools/scite.svg +5 -0
- bibexpy/_web/tools/tableau.svg +1 -0
- bibexpy/_web/tools/vosviewer.png +0 -0
- bibexpy/_web/tools/zotero.svg +1 -0
- bibexpy/cli.py +242 -0
- bibexpy-2.0.0.dist-info/METADATA +105 -0
- bibexpy-2.0.0.dist-info/RECORD +171 -0
- bibexpy-2.0.0.dist-info/WHEEL +5 -0
- bibexpy-2.0.0.dist-info/entry_points.txt +2 -0
- bibexpy-2.0.0.dist-info/licenses/LICENSE +674 -0
- bibexpy-2.0.0.dist-info/top_level.txt +2 -0
bibex_core/MergeDB.py
ADDED
|
@@ -0,0 +1,1189 @@
|
|
|
1
|
+
import pandas as pd
|
|
2
|
+
import re
|
|
3
|
+
import os
|
|
4
|
+
from typing import List, Union
|
|
5
|
+
import numpy as np
|
|
6
|
+
from unidecode import unidecode
|
|
7
|
+
|
|
8
|
+
def trim(text: str) -> str:
    """Collapse whitespace runs to one space and strip both ends.

    Args:
        text: Value to tidy. Anything ``pd.isna`` reports as missing is
            treated as empty; other values are converted with ``str``.

    Returns:
        The tidied string, or ``""`` for a missing value.
    """
    if not pd.isna(text):
        single_spaced = re.sub(r'\s+', ' ', str(text))
        return single_spaced.strip()
    return ""
|
|
13
|
+
|
|
14
|
+
def merge_values(x):
    """Return the first non-missing entry of a pandas Series as a string.

    Used as a reducer while merging databases. Entries after the first
    non-missing one are ignored.

    Args:
        x: Series of candidate values.

    Returns:
        ``str`` of the first entry for which ``pd.notna`` is true, or ``""``
        when the Series is empty or holds only missing values.
    """
    if x.empty:
        return ""
    present = [str(item) for item in x if pd.notna(item)]
    return present[0] if present else ""
|
|
26
|
+
|
|
27
|
+
def meta_tag_extraction(df: pd.DataFrame, tag: str) -> pd.DataFrame:
    """Add an ``SR`` column of the form "<first author> <year>" to ``df``.

    The column is written only when both ``AU`` and ``PY`` columns exist;
    otherwise ``df`` is returned untouched. ``df`` is modified in place and
    the same object is returned.

    Args:
        df: Records table. ``AU`` is read as a ';'-separated author list.
        tag: Not used by this function.

    Returns:
        The DataFrame that was passed in.
    """
    if not all(column in df.columns for column in ('AU', 'PY')):
        return df

    def build_source_label(record):
        authors = record['AU']
        year = record['PY']
        # A missing author list contributes nothing; otherwise only the text
        # before the first ';' is kept.
        lead_author = "" if pd.isna(authors) else str(authors).split(';')[0].strip()
        year_text = "" if pd.isna(year) else str(year)
        return f"{lead_author} {year_text}".strip()

    df['SR'] = df.apply(build_source_label, axis=1)
    return df
|
|
43
|
+
|
|
44
|
+
def clean_merged_values(x: str) -> str:
    """Tidy a ';'-separated string: trim items, drop blanks and repeats.

    Items keep the order of their first appearance and are re-joined with
    ``'; '``. Comparison is exact (case-sensitive).

    Args:
        x: The merged value. Non-string input is handed back unchanged.

    Returns:
        The cleaned string, or ``x`` itself when it is not a ``str``.
    """
    if not isinstance(x, str):
        return x

    trimmed = (piece.strip() for piece in x.split(';'))
    # dict keys are unique and keep insertion order, giving an ordered dedupe.
    ordered_unique = dict.fromkeys(piece for piece in trimmed if piece)
    return '; '.join(ordered_unique)
|
|
62
|
+
|
|
63
|
+
def merge_author_fields(wos_authors: str, scopus_authors: str) -> str:
    """Merge two ';'-separated author lists, keeping the WoS entries first.

    Every WoS author is kept exactly as written. A Scopus author is appended
    (also as written) only when its matching key is not produced by any WoS
    author.

    NOTE: the matching key is built from the first whitespace-separated token
    of the name only (letters A-Z after transliteration and upper-casing), so
    two different authors who share that token are treated as the same
    person and the Scopus one is not added.

    Args:
        wos_authors (str): Author list from WoS. ``None``/NaN is treated as
            an empty list.
        scopus_authors (str): Author list from Scopus. ``None``/NaN is
            treated as an empty list.

    Returns:
        str: Merged author list joined with ``'; '``.
    """
    def as_text(value):
        # Missing cells arrive as None/NaN, which have no .split();
        # treat them as an empty author list instead of raising.
        return "" if pd.isna(value) else str(value)

    def normalize_author(author):
        # Collapse whitespace, transliterate to ASCII, upper-case.
        author = re.sub(r'\s+', ' ', author.strip())
        # unidecode leaves pure-ASCII text unchanged, so skip the call there.
        if not author.isascii():
            author = unidecode(author)
        return author.upper()

    def get_author_key(author):
        # Key = letters of the first token of the normalized name.
        parts = normalize_author(author).split()
        if not parts:
            return ''
        return re.sub(r'[^A-Z]', '', parts[0])

    def split_authors(raw):
        return [a.strip() for a in as_text(raw).split(';') if a.strip()]

    wos_list = split_authors(wos_authors)
    wos_keys = {get_author_key(author) for author in wos_list}

    # Start from the WoS authors, then add Scopus authors with unseen keys.
    merged_authors = list(wos_list)
    merged_authors.extend(
        author for author in split_authors(scopus_authors)
        if get_author_key(author) not in wos_keys
    )

    return '; '.join(merged_authors)
|
|
107
|
+
|
|
108
|
+
def merge_author_fullnames(wos_af: str, scopus_af: str) -> str:
    """Merge two ';'-separated full-name lists, keeping the WoS entries first.

    WoS names are kept exactly as written. A Scopus name is appended only
    when it has the form "LASTNAME, ..." and its upper-cased last name is not
    the last name of any WoS entry. Appended Scopus names are cleaned first:
    parenthesised parts (e.g. IDs) are removed, whitespace is collapsed and
    the text is transliterated to ASCII.

    NOTE: matching uses the last name only, so different authors sharing a
    last name are treated as the same person. Scopus names without a comma
    are never added.

    Args:
        wos_af (str): Author full names from WoS. ``None``/NaN is treated as
            an empty list.
        scopus_af (str): Author full names from Scopus. ``None``/NaN is
            treated as an empty list.

    Returns:
        str: Merged names joined with ``';'`` (no space after the semicolon).
    """
    def as_text(value):
        # Missing cells arrive as None/NaN (NaN is truthy and has no
        # .split()); treat them as an empty list instead of raising.
        return "" if pd.isna(value) else str(value)

    def clean_author_name(author):
        # Remove parenthesised parts such as "(57190000000)".
        author = re.sub(r'\s*\([^)]*\)', '', author)
        # Collapse whitespace.
        author = re.sub(r'\s+', ' ', author.strip())
        # unidecode leaves pure-ASCII text unchanged, so skip the call there.
        if not author.isascii():
            author = unidecode(author)
        return author

    def get_author_key(author):
        # Key = upper-cased text before the first comma; '' when no comma.
        lastname, comma, _rest = clean_author_name(author).partition(',')
        if not comma:
            return ''
        return lastname.strip().upper()

    wos_list = [a.strip() for a in as_text(wos_af).split(';') if a.strip()]
    wos_keys = {get_author_key(author) for author in wos_list}

    # Start from the WoS names, then add Scopus names with unseen keys.
    merged_authors = list(wos_list)
    for scopus_author in (a.strip() for a in as_text(scopus_af).split(';')):
        if not scopus_author:
            continue
        author_key = get_author_key(scopus_author)
        if author_key and author_key not in wos_keys:
            merged_authors.append(clean_author_name(scopus_author))

    return ';'.join(merged_authors)
|
|
156
|
+
|
|
157
|
+
def merge_address_fields(wos_addresses: str, scopus_addresses: str) -> str:
    """Choose the C1 (author addresses) value, preferring Scopus over WoS.

    The first candidate, in the order Scopus then WoS, that is not missing
    and is non-empty after stripping is returned (stripped). Nothing is
    combined: exactly one source wins.

    Args:
        wos_addresses (str): Address information from WoS.
        scopus_addresses (str): Address information from Scopus.

    Returns:
        str: The selected address text, or ``''`` when neither source has
        usable text or when any exception occurs while checking them.
    """
    try:
        for candidate in (scopus_addresses, wos_addresses):
            if pd.notna(candidate):
                text = str(candidate).strip()
                if text:
                    return text
        return ''
    except Exception:
        # Best effort: any failure while inspecting the inputs yields ''.
        return ''
|
|
185
|
+
|
|
186
|
+
def merge_reprint_author(wos_rp: str, scopus_rp: str) -> str:
    """Choose the reprint-author value, preferring WoS over Scopus.

    The chosen value is only stripped of surrounding whitespace; no other
    formatting is applied.

    Args:
        wos_rp (str): Reprint author from WoS.
        scopus_rp (str): Reprint author from Scopus.

    Returns:
        str: The stripped WoS value if it is present and non-blank, else the
        stripped Scopus value if present and non-blank, else ``''``.
    """
    def usable_text(value):
        # '' for missing values, otherwise the stripped string form.
        return str(value).strip() if pd.notna(value) else ''

    # `or` falls through to Scopus only when WoS has no usable text.
    return usable_text(wos_rp) or usable_text(scopus_rp)
|
|
209
|
+
|
|
210
|
+
def merge_references(wos_refs: str, scopus_refs: str) -> str:
    """Merge the WoS and Scopus cited-reference lists of one record.

    Each reference gets a matching key of the form "<AUTHOR>_<YEAR>":

    * Scopus style (text ends with "(YYYY)"): author is the text before the
      first comma, year is the trailing parenthesised number.
    * WoS style ("AUTHOR, YYYY, ..."): author is the first comma-separated
      part, year is the first 4-digit run in the second part.
    * Anything else: the whole reference reduced to A-Z/0-9.

    When a key occurs in both sources the WoS text is kept. When several
    references within one source share a key, only the last of them is kept.

    The result order is deterministic: WoS references in their original
    order, followed by the Scopus references whose key WoS does not have,
    in their original order.

    Args:
        wos_refs (str): References from WoS, separated by semicolons.
        scopus_refs (str): References from Scopus, separated by semicolons.

    Returns:
        str: Merged references joined with ``'; '`` (``''`` if none).
    """
    def split_and_clean_refs(refs_str):
        if pd.isna(refs_str) or not refs_str:
            return []
        return [ref.strip() for ref in str(refs_str).split(';') if ref.strip()]

    def clean_text(text):
        # Drop every character except upper-case letters and digits
        # (punctuation, spaces, commas, ...).
        return re.sub(r'[^A-Z0-9]', '', text.upper())

    def create_ref_key(ref):
        ref = ref.upper().strip()

        # Scopus format: the year sits in parentheses at the very end.
        if ref.endswith(')'):
            year_match = re.search(r'\((\d{4})\)$', ref)
            if year_match:
                # Text up to the first comma is taken as the author.
                author = clean_text(ref.split(',')[0])
                return f"{author}_{year_match.group(1)}"

        # WoS format: "AUTHOR, YEAR, ...".
        parts = ref.split(',')
        if len(parts) >= 2:
            year_match = re.search(r'\d{4}', parts[1])
            if year_match:
                return f"{clean_text(parts[0])}_{year_match.group()}"

        # Unrecognised format: fall back to the cleaned full text.
        return clean_text(ref)

    # dicts keep insertion order, so iterating them preserves source order.
    wos_by_key = {create_ref_key(ref): ref for ref in split_and_clean_refs(wos_refs)}
    scopus_by_key = {create_ref_key(ref): ref for ref in split_and_clean_refs(scopus_refs)}

    # Prefer the WoS wording; add only the Scopus references WoS lacks.
    merged_refs = list(wos_by_key.values())
    merged_refs.extend(
        ref for key, ref in scopus_by_key.items() if key not in wos_by_key
    )

    return '; '.join(merged_refs)
|
|
279
|
+
|
|
280
|
+
def merge_abstracts(wos_ab: str, scopus_ab: str) -> str:
    """Pick one abstract from the WoS and Scopus versions of a record.

    Both abstracts are cleaned first (whitespace collapsed, a trailing
    "© ... reserved" notice removed). No text is combined: if only one
    source has an abstract it is used, otherwise the longer cleaned text
    wins and Scopus wins a tie.

    Args:
        wos_ab (str): Abstract from WoS.
        scopus_ab (str): Abstract from Scopus.

    Returns:
        str: The selected cleaned abstract, or ``""`` when both are empty.
    """
    def clean_abstract(ab):
        if pd.isna(ab) or not ab:
            return ""
        # Collapse whitespace runs.
        collapsed = re.sub(r'\s+', ' ', ab.strip())
        # Remove the copyright notice at the end of the text.
        without_notice = re.sub(r'©.*?RESERVED\.?$', '', collapsed, flags=re.IGNORECASE)
        return without_notice.strip()

    # Clean the abstracts coming from both sources.
    wos_text, scopus_text = clean_abstract(wos_ab), clean_abstract(scopus_ab)

    # Both present: prefer the strictly longer one (Scopus on equal length).
    if wos_text and scopus_text:
        return wos_text if len(wos_text) > len(scopus_text) else scopus_text

    # At most one is present: use whichever is non-empty.
    return wos_text or scopus_text
|
|
312
|
+
|
|
313
|
+
def merge_keywords(wos_keywords: str, scopus_keywords: str) -> str:
    """Merge the ';'-separated author keywords of WoS and Scopus.

    Each keyword has its whitespace collapsed and is transliterated to
    ASCII; case is preserved. Duplicates are detected case-insensitively and
    the first spelling seen (WoS before Scopus) is kept. The result is
    sorted case-insensitively.

    Args:
        wos_keywords (str): Keywords from WoS. ``None``/NaN means none.
        scopus_keywords (str): Keywords from Scopus. ``None``/NaN means none.

    Returns:
        str: Unique keywords joined with ``'; '`` (``''`` if none).
    """
    def clean_keyword(kw):
        # Remove extra spaces.
        kw = re.sub(r'\s+', ' ', kw.strip())
        # Normalize special letters (é->e, ñ->n, ...) while preserving case.
        # unidecode leaves pure-ASCII text unchanged, so skip the call there.
        if kw and not kw.isascii():
            kw = unidecode(kw)
        return kw

    def split_keywords(raw):
        # Test for a missing value BEFORE str(): str(nan) is 'nan' and
        # str(None) is 'None', which would otherwise become keywords.
        if pd.isna(raw):
            return []
        cleaned = (clean_keyword(kw) for kw in str(raw).split(';'))
        return [kw for kw in cleaned if kw]

    seen = set()
    unique_keywords = []
    for kw in split_keywords(wos_keywords) + split_keywords(scopus_keywords):
        # Upper-cased form is the case-insensitive identity of a keyword.
        kw_upper = kw.upper()
        if kw_upper not in seen:
            seen.add(kw_upper)
            unique_keywords.append(kw)

    # Sort alphabetically (case-insensitive) for consistency.
    unique_keywords.sort(key=str.upper)

    return '; '.join(unique_keywords)
|
|
354
|
+
|
|
355
|
+
def merge_index_keywords(wos_keywords: str, scopus_keywords: str) -> str:
    """Merge WoS Keywords Plus with Scopus Index Keywords.

    Each keyword has its whitespace collapsed and is transliterated to
    ASCII; case is preserved. Duplicates are detected case-insensitively and
    the first spelling seen (WoS before Scopus) is kept. The result is
    sorted case-insensitively.

    Args:
        wos_keywords (str): Keywords from WoS (Keywords Plus).
            ``None``/NaN means none.
        scopus_keywords (str): Keywords from Scopus (Index Keywords).
            ``None``/NaN means none.

    Returns:
        str: Unique keywords joined with ``'; '`` (``''`` if none).
    """
    def clean_keyword(kw):
        # Remove extra spaces.
        kw = re.sub(r'\s+', ' ', kw.strip())
        # Normalize special letters (é->e, ñ->n, ...) while preserving case.
        # unidecode leaves pure-ASCII text unchanged, so skip the call there.
        if kw and not kw.isascii():
            kw = unidecode(kw)
        return kw

    def cleaned_keywords(raw):
        # A missing cell must be detected before str(): otherwise NaN/None
        # turn into the literal keywords 'nan'/'None'.
        if pd.isna(raw):
            return
        for piece in str(raw).split(';'):
            keyword = clean_keyword(piece)
            if keyword:
                yield keyword

    # Keyed by upper-cased keyword; setdefault keeps the first spelling seen.
    first_spelling = {}
    for source in (wos_keywords, scopus_keywords):
        for keyword in cleaned_keywords(source):
            first_spelling.setdefault(keyword.upper(), keyword)

    # Sort alphabetically (case-insensitive) for consistency.
    return '; '.join(sorted(first_spelling.values(), key=str.upper))
|
|
396
|
+
|
|
397
|
+
def merge_publisher(wos_pub: str, scopus_pub: str) -> str:
    """Pick the publisher name, preferring the longer of the two sources.

    Both names are cleaned first (whitespace collapsed, transliterated to
    ASCII, case preserved). If only one source has a name it is used;
    otherwise the longer cleaned name wins and WoS wins a tie.

    Args:
        wos_pub (str): Publisher from WoS.
        scopus_pub (str): Publisher from Scopus.

    Returns:
        str: The selected cleaned publisher name, or ``""`` if both are
        empty.
    """
    def clean_publisher(pub):
        if pd.isna(pub) or not pub:
            return ""
        # Collapse whitespace, then normalize special characters.
        return unidecode(re.sub(r'\s+', ' ', pub.strip()))

    wos_name = clean_publisher(wos_pub)
    scopus_name = clean_publisher(scopus_pub)

    # Both present: the strictly longer name is usually the more complete.
    if wos_name and scopus_name:
        return scopus_name if len(scopus_name) > len(wos_name) else wos_name

    # At most one is present: use whichever is non-empty.
    return wos_name or scopus_name
|
|
432
|
+
|
|
433
|
+
def merge_language(wos_lang: str, scopus_lang: str) -> str:
    """Merge and standardize the language field of WoS and Scopus.

    Each input may hold several ';'-separated languages. Every entry is
    upper-cased and mapped through ``LANGUAGE_MAP`` (codes and native names
    to English names). The entry is looked up as written first, so the
    native-script keys of the map can match, and then in its ASCII
    transliteration. Entries that are not in the map are kept in their
    upper-cased ASCII form. Blank entries are ignored.

    Args:
        wos_lang (str): Language from WoS.
        scopus_lang (str): Language from Scopus.

    Returns:
        str: The unique standardized languages of both sources, sorted and
        joined with ``'; '``; ``"ENGLISH"`` when neither source has any.
    """
    # Language code/name mapping
    LANGUAGE_MAP = {
        # Common codes
        'ENG': 'ENGLISH',
        'EN': 'ENGLISH',
        'FRE': 'FRENCH',
        'FR': 'FRENCH',
        'GER': 'GERMAN',
        'DE': 'GERMAN',
        'SPA': 'SPANISH',
        'ES': 'SPANISH',
        'ITA': 'ITALIAN',
        'IT': 'ITALIAN',
        'POR': 'PORTUGUESE',
        'PT': 'PORTUGUESE',
        'RUS': 'RUSSIAN',
        'RU': 'RUSSIAN',
        'CHI': 'CHINESE',
        'ZH': 'CHINESE',
        'JPN': 'JAPANESE',
        'JA': 'JAPANESE',
        # Full names with special characters
        'ESPANOL': 'SPANISH',
        'ESPAÑOL': 'SPANISH',
        'FRANCAIS': 'FRENCH',
        'FRANÇAIS': 'FRENCH',
        'DEUTSCHE': 'GERMAN',
        'PORTUGUES': 'PORTUGUESE',
        'PORTUGUÊS': 'PORTUGUESE',
        'ITALIANO': 'ITALIAN',
        'RUSSKIY': 'RUSSIAN',
        'РУССКИЙ': 'RUSSIAN',
        '中文': 'CHINESE',
        '日本語': 'JAPANESE'
    }

    def standardize_language(lang):
        """Return the set of standardized names found in one raw field."""
        if pd.isna(lang) or not lang:
            return set()

        standardized = set()
        for piece in str(lang).split(';'):
            token = re.sub(r'\s+', ' ', piece).strip()
            if not token:
                # e.g. a trailing ';' — never emit an empty language.
                continue

            # 1) Look up the name as written, so keys such as 'ESPAÑOL',
            #    'РУССКИЙ' or '中文' are reachable.
            native = token.upper()
            if native in LANGUAGE_MAP:
                standardized.add(LANGUAGE_MAP[native])
                continue

            # 2) Fall back to the ASCII transliteration. unidecode leaves
            #    pure-ASCII text unchanged, so skip the call there.
            if token.isascii():
                ascii_name = native
            else:
                ascii_name = re.sub(r'\s+', ' ', unidecode(token)).strip().upper()
            if ascii_name:
                standardized.add(LANGUAGE_MAP.get(ascii_name, ascii_name))

        return standardized

    # Union of both sources; sorted for a stable result.
    all_langs = standardize_language(wos_lang) | standardize_language(scopus_lang)
    if all_langs:
        return '; '.join(sorted(all_langs))

    # If no language data is available, default to ENGLISH
    return "ENGLISH"
|
|
521
|
+
|
|
522
|
+
def merge_document_type(wos_dt: str, scopus_dt: str) -> str:
    """Merge the document-type field of WoS and Scopus.

    Each input may hold several ';'-separated types. The field is split on
    ';' first and each type is then cleaned on its own: whitespace
    collapsed, transliterated to ASCII, upper-cased, and every character
    other than A-Z, 0-9 and whitespace removed. Splitting before cleaning
    matters because the cleaning step would otherwise delete the ';'
    separators. Because types are compared after upper-casing, duplicates
    are detected case-insensitively.

    Args:
        wos_dt (str): Document type from WoS.
        scopus_dt (str): Document type from Scopus.

    Returns:
        str: The unique cleaned types of both sources, sorted and joined
        with ``'; '``; ``""`` when neither source has any.
    """
    def clean_doctypes(raw):
        """Return the set of cleaned types found in one raw field."""
        if pd.isna(raw) or not raw:
            return set()

        cleaned = set()
        for piece in str(raw).split(';'):
            # Remove extra spaces.
            text = re.sub(r'\s+', ' ', piece.strip())
            # Normalize special characters. unidecode leaves pure-ASCII
            # text unchanged, so skip the call there.
            if not text.isascii():
                text = unidecode(text)
            # Upper-case after transliteration so letters it produces are
            # not discarded by the A-Z filter below.
            text = re.sub(r'[^A-Z0-9\s]', '', text.upper()).strip()
            if text:
                cleaned.add(text)
        return cleaned

    # Combine unique document types; sort for consistency.
    all_types = clean_doctypes(wos_dt) | clean_doctypes(scopus_dt)
    return '; '.join(sorted(all_types))
|
|
569
|
+
|
|
570
|
+
|
|
571
|
+
def merge_url(wos_url: str, scopus_url: str) -> str:
    """Choose the record URL, preferring WoS over Scopus.

    Args:
        wos_url (str): URL from WoS.
        scopus_url (str): URL from Scopus (Link).

    Returns:
        str: The stripped WoS URL when it is non-empty, otherwise the
        stripped Scopus URL (``""`` when both are missing or blank).
    """
    def clean_url(url):
        # Missing or falsy values become ''; anything else is stripped text.
        return "" if (pd.isna(url) or not url) else str(url).strip()

    # Clean both inputs up front, then let `or` fall back to Scopus.
    wos_clean = clean_url(wos_url)
    scopus_clean = clean_url(scopus_url)
    return wos_clean or scopus_clean
|
|
594
|
+
|
|
595
|
+
def merge_open_access(wos_oa: str, scopus_oa: str) -> str:
    """
    Reconcile the Open Access status reported by WoS and by Scopus.

    Each input is normalised (missing/falsy -> "", whitespace collapsed,
    upper-cased, passed through ``unidecode``) and then looked up in a table
    of known spellings. A value that is not in the table is kept in its
    normalised form. The lookup uses the whole string, so a multi-valued
    status is not split into its parts.

    Resolution rules, in order:
        * only one source has a status  -> that status
        * neither source has a status   -> 'NON OPEN ACCESS'
        * WoS reports plain 'OPEN ACCESS' -> the Scopus status
        * anything else                 -> the WoS status

    Args:
        wos_oa (str): Open Access status from WoS
        scopus_oa (str): Open Access status from Scopus

    Returns:
        str: Standardized Open Access status
    """
    # Known spellings -> canonical status label
    OA_STATUS_MAP = {
        # Common variations
        'OPEN ACCESS': 'OPEN ACCESS',
        'OA': 'OPEN ACCESS',
        'GOLD': 'GOLD OPEN ACCESS',
        'GOLD OPEN ACCESS': 'GOLD OPEN ACCESS',
        'GREEN': 'GREEN OPEN ACCESS',
        'GREEN OPEN ACCESS': 'GREEN OPEN ACCESS',
        'BRONZE': 'BRONZE OPEN ACCESS',
        'BRONZE OPEN ACCESS': 'BRONZE OPEN ACCESS',
        'HYBRID': 'HYBRID OPEN ACCESS',
        'HYBRID OPEN ACCESS': 'HYBRID OPEN ACCESS',
        # Additional variations
        'ALL OPEN ACCESS': 'OPEN ACCESS',
        'PUBLISHED': 'OPEN ACCESS',
        'FREE': 'OPEN ACCESS',
        'PUBLISHERFULLGOLD': 'GOLD OPEN ACCESS',
        'REPOSITORY': 'GREEN OPEN ACCESS',
        # Non-OA variations
        'SUBSCRIPTION': 'NON OPEN ACCESS',
        'NON-OA': 'NON OPEN ACCESS',
        'CLOSED': 'NON OPEN ACCESS',
    }

    def _normalise(raw):
        """Return the canonical label for *raw*, or "" when it is missing."""
        if pd.isna(raw) or not raw:
            return ""
        collapsed = re.sub(r'\s+', ' ', str(raw).strip()).upper()
        ascii_form = unidecode(collapsed)
        return OA_STATUS_MAP.get(ascii_form, ascii_form)

    wos_status = _normalise(wos_oa)
    scopus_status = _normalise(scopus_oa)

    # Zero or one usable status: take whichever exists, else the default.
    if not (wos_status and scopus_status):
        return wos_status or scopus_status or 'NON OPEN ACCESS'

    # Both present. The generic WoS label yields to whatever Scopus says;
    # NOTE(review): that includes a Scopus 'NON OPEN ACCESS' - confirm this
    # is the intended outcome for contradictory sources.
    if wos_status == 'OPEN ACCESS':
        return scopus_status

    # Identical statuses, a generic Scopus label, or two specific labels:
    # WoS wins in every case.
    return wos_status
|
|
664
|
+
|
|
665
|
+
def clean_scopus_author_fullnames(df: pd.DataFrame) -> pd.DataFrame:
    """
    Cleans Scopus author full names by removing the parenthesised IDs.

    Example input: "CAO, NANNAN (58490132900)"
    Example output: "CAO, NANNAN"

    Multiple authors are separated by ';' on input and joined with '; ' on
    output; entries that are empty after cleaning are dropped. Missing or
    falsy cells become "". Non-string cells are converted with ``str`` before
    being processed instead of raising.

    The 'AF' column is replaced on the DataFrame that was passed in (the
    frame is modified in place and also returned).

    Args:
        df (pd.DataFrame): DataFrame containing Scopus data

    Returns:
        pd.DataFrame: The same DataFrame with a cleaned 'AF' column, or the
            untouched input when it has no 'AF' column
    """
    if 'AF' not in df.columns:
        return df

    # Compiled once here rather than once per author inside the loop.
    id_suffix = re.compile(r'\s*\([^)]*\)')
    whitespace_run = re.compile(r'\s+')

    def clean_author(author_str):
        if pd.isna(author_str) or not author_str:
            return ""

        cleaned_authors = []
        # str() keeps a stray numeric/other non-string cell from crashing
        # the whole column on `.split`.
        for author in str(author_str).split(';'):
            # Remove ID in parentheses
            author = id_suffix.sub('', author)
            # Collapse inner whitespace and trim the ends
            author = whitespace_run.sub(' ', author).strip()
            if author:
                cleaned_authors.append(author)

        return '; '.join(cleaned_authors)

    df['AF'] = df['AF'].apply(clean_author)
    return df
|
|
700
|
+
|
|
701
|
+
def merge_source_title(wos_so: str, scopus_so: str) -> str:
    """
    Choose one source (journal/book) title from WoS and Scopus.

    Each title is normalised first: a missing (NaN/None) or falsy value
    becomes "", anything else is converted to ``str``, stripped, and has
    runs of whitespace collapsed to a single space. The Scopus title is
    returned whenever it is non-empty after normalisation.

    Args:
        wos_so (str): Source title from WoS
        scopus_so (str): Source title from Scopus

    Returns:
        str: The normalised Scopus title, else the normalised WoS title,
            else "" when neither source has one
    """
    # Normalise both inputs eagerly, WoS first, then Scopus.
    normalised = [
        "" if (pd.isna(raw) or not raw) else re.sub(r'\s+', ' ', str(raw).strip())
        for raw in (wos_so, scopus_so)
    ]
    wos_title, scopus_title = normalised

    # Scopus has priority; an empty string falls through to WoS (which may
    # itself be empty, giving "").
    return scopus_title or wos_title
|
|
733
|
+
|
|
734
|
+
def _has_text(value) -> bool:
    """Return True when *value* is neither missing nor blank after stripping."""
    return bool(pd.notna(value)) and bool(str(value).strip())


def _has_doi(doi_column):
    """Return a boolean mask that is True where the DOI is present and not blank."""
    return doi_column.notna() & (doi_column.astype(str).str.strip() != '')


def _title_year_key(frame):
    """Build the 'cleaned title + year' key used to detect duplicates."""
    titles = frame['TI'].apply(lambda title: re.sub(r'[^a-zA-Z0-9\s]', '', str(title)))
    titles = titles.apply(trim)
    return titles + ' ' + frame['PY'].astype(str)


def _merge_group_column(values):
    """Collapse one column of a duplicate group into a single value."""
    if values.name == 'DB_Original':
        # Keep every distinct source database, e.g. 'ISI; SCOPUS'.
        return '; '.join(sorted({str(val) for val in values if pd.notna(val)}))
    return merge_values(values)


def _split_by_source(value, is_wos):
    """Return (wos_value, scopus_value) with '' standing in for the other source."""
    return (value, '') if is_wos else ('', value)


def _merge_nonempty_column(M, column, rows, merge_func):
    """Run *merge_func* on rows whose value is present and truthy; store the result."""
    if column not in M.columns:
        return
    for idx, is_wos in rows:
        value = M.at[idx, column]
        if pd.isna(value) or not value:
            continue
        wos_value, scopus_value = _split_by_source(value, is_wos)
        M.at[idx, column] = merge_func(wos_value, scopus_value)


def _merge_text_column(M, column, rows, merge_func, error_label):
    """Run *merge_func* on the stringified value; keep the old value on error or empty result."""
    if column not in M.columns:
        return
    for idx, is_wos in rows:
        try:
            value = M.at[idx, column]
            text = '' if pd.isna(value) else str(value)
            if not text:
                continue
            wos_text, scopus_text = _split_by_source(text, is_wos)
            merged = merge_func(wos_text, scopus_text)
            if merged:
                M.at[idx, column] = merged
        except Exception as e:
            # Best effort: report the row and leave its current value alone.
            print(f"{error_label} for index {idx}: {str(e)}")


def _merge_whole_column(M, column, rows, merge_func):
    """Run *merge_func* on every listed row, missing values included; store the result."""
    if column not in M.columns:
        return
    for idx, is_wos in rows:
        wos_value, scopus_value = _split_by_source(M.at[idx, column], is_wos)
        M.at[idx, column] = merge_func(wos_value, scopus_value)


def merge_db_sources(*dataframes: pd.DataFrame, remove_duplicated: bool = True, merge_fields: bool = True, verbose: bool = False) -> pd.DataFrame:
    """
    Merges bibliometric data from different databases.

    The frames are concatenated, duplicates are optionally removed (first by
    DOI, then by cleaned title + publication year) and a set of per-field
    clean-up functions is applied to the result. The input frames are copied
    first and are not modified.

    Parameters:
    -----------
    *dataframes : pd.DataFrame
        Bibliographic data frames to be merged. The source of a frame is
        taken from the first value of its 'DB' column ('ISI' or 'SCOPUS').
    remove_duplicated : bool, default=True
        If True, duplicate documents are removed
    merge_fields : bool, default=True
        If True, the rows of a duplicate group are combined column by column
        (via ``merge_values``); if False only the first row of each group is
        kept. Only used when ``remove_duplicated`` is True.
    verbose : bool, default=False
        If True, prints how many duplicate documents were removed

    Returns:
    --------
    pd.DataFrame
        Merged bibliographic data frame. 'DB_Original' records the source
        database(s) of every row; the frame is finally passed through
        ``meta_tag_extraction(M, 'SR')``.

    Raises:
    -------
    ValueError
        If no data frame is given
    KeyError
        If none of the data frames has a 'DB' column
    """

    if not dataframes:
        raise ValueError("At least one data frame is required!")

    # Prepare each frame on a private copy so the caller's data is untouched.
    prepared = []
    for df in dataframes:
        df = df.copy()
        # An empty frame has no first row to read the source from.
        source = df['DB'].iloc[0] if 'DB' in df.columns and len(df) > 0 else None

        # Keep the reprint address of each source in its own temporary
        # column so that the WoS value can be preferred after de-duplication.
        if 'RP' in df.columns:
            if source == 'SCOPUS':
                df['RP_SCOPUS'] = df['RP']
                df['RP_WOS'] = ''
            elif source == 'ISI':
                df['RP_WOS'] = df['RP']
                df['RP_SCOPUS'] = ''

        # Scopus full names carry author IDs in parentheses; strip them.
        if source == 'SCOPUS':
            df = clean_scopus_author_fullnames(df)
        prepared.append(df)

    # Merge data frames
    M = pd.concat(prepared, ignore_index=True)
    initial_size = len(M)

    # Track the original source database(s) of every record
    M['DB_Original'] = M['DB']

    if remove_duplicated:
        if merge_fields:
            # Group by DOI and combine the rows of each group. Blank DOIs are
            # treated like missing ones; otherwise every record with an empty
            # DOI string would be collapsed into a single row.
            if 'DI' in M.columns:
                with_doi = _has_doi(M['DI'])
                if with_doi.any():
                    grouped = M[with_doi].groupby('DI', as_index=False).agg(_merge_group_column)
                    # A ';' means the record was found in more than one database.
                    grouped.loc[grouped['DB_Original'].str.contains(';', regex=False), 'DB'] = 'BIBEXPY'
                    M = pd.concat([grouped, M[~with_doi]], ignore_index=True)

            # Check duplicates by title and year
            if 'TI' in M.columns and 'PY' in M.columns:
                M['title_year'] = _title_year_key(M)
                grouped = M.groupby('title_year', as_index=False).agg(_merge_group_column)
                grouped.loc[grouped['DB_Original'].str.contains(';', regex=False), 'DB'] = 'BIBEXPY'
                M = grouped.drop(['title_year'], axis=1)
        else:
            # Just remove duplicate records, keeping the first occurrence
            if 'DI' in M.columns:
                repeated_doi = M['DI'].duplicated() & _has_doi(M['DI'])
                M = M[~repeated_doi].copy()

            if 'TI' in M.columns and 'PY' in M.columns:
                M = M[~_title_year_key(M).duplicated()].copy()

        if verbose:
            print(f"{initial_size - len(M)} duplicated documents have been removed")

    # If there are multiple databases, label the whole collection as 'ISI'
    # (this also replaces the 'BIBEXPY' marker; 'DB_Original' keeps the detail).
    if len(M['DB'].unique()) > 1:
        M['DB'] = 'ISI'

    # Complete WC and SC fields from each other
    if 'WC' in M.columns and 'SC' in M.columns:
        # Fill WC from SC if WC is empty
        M['WC'] = M.apply(lambda row: row['SC'] if not _has_text(row['WC']) and pd.notna(row['SC']) else row['WC'], axis=1)
        # Fill SC from WC if SC is empty
        M['SC'] = M.apply(lambda row: row['WC'] if not _has_text(row['SC']) and pd.notna(row['WC']) else row['SC'], axis=1)

    # Merge RP data using temporary columns: WoS first, then Scopus, else ''
    if 'RP_WOS' in M.columns and 'RP_SCOPUS' in M.columns:
        M['RP'] = M.apply(
            lambda row: row['RP_WOS'] if _has_text(row['RP_WOS'])
            else (row['RP_SCOPUS'] if _has_text(row['RP_SCOPUS']) else ''),
            axis=1)
        # Drop temporary columns
        M = M.drop(['RP_WOS', 'RP_SCOPUS'], axis=1)

    # Rows that come from exactly one source: (index label, is_wos).
    # Rows merged from several databases ('ISI; SCOPUS') already hold their
    # combined values and are deliberately left out: feeding them through the
    # merge functions with two empty inputs would erase their URL/LA and
    # force their OA status to 'NON OPEN ACCESS'.
    wos_rows = list(M.index[M['DB_Original'] == 'ISI'])
    scopus_rows = list(M.index[M['DB_Original'] == 'SCOPUS'])
    single_source_rows = [(idx, True) for idx in wos_rows] + [(idx, False) for idx in scopus_rows]
    both_sources_present = bool(wos_rows) and bool(scopus_rows)

    # NOTE: AU, AF, JI and UT need no per-row pass here. A row belongs to one
    # source only, so there is never a second value to merge with and the
    # value produced by the de-duplication step is kept as it is.

    # Clean addresses (done even when only one source is present)
    if 'C1' in M.columns:
        for idx, _ in single_source_rows:
            address = M.at[idx, 'C1']
            if _has_text(address):
                M.at[idx, 'C1'] = str(address).strip()

    # The remaining clean-ups only run when the collection really mixes
    # WoS-only and Scopus-only records.
    if both_sources_present:
        # Use Scopus source title when available, otherwise use WoS
        _merge_text_column(M, 'SO', single_source_rows, merge_source_title,
                           "Warning: Error merging source titles")
        # Clean and merge abstracts
        _merge_nonempty_column(M, 'AB', single_source_rows, merge_abstracts)
        # Clean and merge author keywords
        _merge_nonempty_column(M, 'DE', single_source_rows, merge_keywords)
        # Clean and merge index keywords
        _merge_nonempty_column(M, 'ID', single_source_rows, merge_index_keywords)
        # Clean and merge references
        _merge_text_column(M, 'CR', single_source_rows, merge_references,
                           "Error merging references")
        # Clean and merge publisher names
        _merge_nonempty_column(M, 'PU', single_source_rows, merge_publisher)
        # Clean and merge language information
        _merge_whole_column(M, 'LA', single_source_rows, merge_language)
        # Clean and merge document types
        _merge_text_column(M, 'DT', single_source_rows, merge_document_type,
                           "Warning: Error merging document types")
        # Clean and merge URLs
        _merge_whole_column(M, 'URL', single_source_rows, merge_url)
        # Clean and merge Open Access status
        _merge_whole_column(M, 'OA', single_source_rows, merge_open_access)

    # Create SR tag
    M = meta_tag_extraction(M, 'SR')

    return M
|
|
1136
|
+
|
|
1137
|
+
def main():
    """
    Command-line entry point: merge every Excel export found in ``rawData``.

    Reads all ``*.xlsx`` files in the ``rawData`` folder of the current
    working directory, merges them with ``merge_db_sources`` (duplicates
    removed, fields merged) and writes the result to ``merged_data.xlsx``.
    Files that cannot be read are reported and skipped. Progress and errors
    are printed; nothing is returned and no exception is propagated.
    """
    try:
        print("Database Merge Tool")
        print("------------------")

        # Find Excel files in rawData folder
        raw_data_path = "rawData"
        # isdir (not exists): a plain file called "rawData" is not usable.
        if not os.path.isdir(raw_data_path):
            print("Error: rawData folder not found!")
            return

        # os.listdir returns entries in arbitrary order; sort them so the
        # merge order (and therefore which duplicate is seen first) is
        # reproducible. '~$' files are Excel lock files, not workbooks.
        excel_files = sorted(
            f for f in os.listdir(raw_data_path)
            if f.endswith('.xlsx') and not f.startswith('~$')
        )

        if not excel_files:
            print("Error: No Excel files found in rawData folder!")
            return

        print("\nFound Excel files:")
        for i, file in enumerate(excel_files, 1):
            print(f"{i}. {file}")

        dataframes = []
        for file in excel_files:
            try:
                file_path = os.path.join(raw_data_path, file)
                df = pd.read_excel(file_path)
                dataframes.append(df)
                print(f"\n{file} loaded successfully.")
                print(f"Record count: {len(df)}")
            except Exception as e:
                # One unreadable workbook must not stop the others.
                print(f"Error: Could not read file {file}: {str(e)}")

        if not dataframes:
            print("\nNo files could be read!")
            return

        print("\nMerging data...")
        merged_df = merge_db_sources(*dataframes, remove_duplicated=True, merge_fields=True, verbose=False)

        # Save merged file
        output_file = "merged_data.xlsx"
        merged_df.to_excel(output_file, index=True)  # SR will be used as index

        print(f"\nMerged data saved to {output_file}")
        print(f"Total record count: {len(merged_df)}")

    except Exception as e:
        # Top-level boundary of the script: report instead of a traceback.
        print(f"\nAn unexpected error occurred: {str(e)}")
    finally:
        print("\nProgram terminating...")


if __name__ == "__main__":
    main()
|