bibexpy 2.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (171)
  1. bibex_core/MergeDB.py +1189 -0
  2. bibex_core/__init__.py +13 -0
  3. bibex_core/modules/__init__.py +35 -0
  4. bibex_core/modules/api_utils.py +1172 -0
  5. bibex_core/modules/c1_utils.py +423 -0
  6. bibex_core/modules/file_utils.py +71 -0
  7. bibex_core/modules/ml_utils.py +332 -0
  8. bibex_core/modules/post_process.py +50 -0
  9. bibex_core/modules/stats_utils.py +216 -0
  10. bibex_core/scp2xlsx.py +235 -0
  11. bibex_core/wos2xlsx.py +159 -0
  12. bibex_core/xlsx2vos.py +185 -0
  13. bibexpy/__init__.py +9 -0
  14. bibexpy/__main__.py +6 -0
  15. bibexpy/_server/__init__.py +1 -0
  16. bibexpy/_server/config.py +92 -0
  17. bibexpy/_server/jobs/__init__.py +3 -0
  18. bibexpy/_server/jobs/runner.py +301 -0
  19. bibexpy/_server/main.py +211 -0
  20. bibexpy/_server/models/__init__.py +0 -0
  21. bibexpy/_server/models/project.py +25 -0
  22. bibexpy/_server/routers/__init__.py +0 -0
  23. bibexpy/_server/routers/audit.py +65 -0
  24. bibexpy/_server/routers/convert.py +101 -0
  25. bibexpy/_server/routers/disambiguate.py +177 -0
  26. bibexpy/_server/routers/downloads.py +26 -0
  27. bibexpy/_server/routers/enrich.py +42 -0
  28. bibexpy/_server/routers/export.py +116 -0
  29. bibexpy/_server/routers/export_folder.py +97 -0
  30. bibexpy/_server/routers/filter.py +131 -0
  31. bibexpy/_server/routers/jobs.py +72 -0
  32. bibexpy/_server/routers/merge.py +437 -0
  33. bibexpy/_server/routers/prepare.py +241 -0
  34. bibexpy/_server/routers/projects.py +49 -0
  35. bibexpy/_server/routers/quality.py +288 -0
  36. bibexpy/_server/routers/records.py +204 -0
  37. bibexpy/_server/routers/report.py +111 -0
  38. bibexpy/_server/routers/settings.py +383 -0
  39. bibexpy/_server/routers/system.py +209 -0
  40. bibexpy/_server/routers/tools.py +257 -0
  41. bibexpy/_server/routers/upload.py +110 -0
  42. bibexpy/_server/services/__init__.py +0 -0
  43. bibexpy/_server/services/analyses.py +364 -0
  44. bibexpy/_server/services/audit.py +340 -0
  45. bibexpy/_server/services/bibex_adapter.py +47 -0
  46. bibexpy/_server/services/bibtex_writer.py +104 -0
  47. bibexpy/_server/services/converter.py +205 -0
  48. bibexpy/_server/services/disambiguation/__init__.py +19 -0
  49. bibexpy/_server/services/disambiguation/blocking.py +581 -0
  50. bibexpy/_server/services/disambiguation/cache.py +47 -0
  51. bibexpy/_server/services/disambiguation/deepseek_client.py +203 -0
  52. bibexpy/_server/services/disambiguation/orcid.py +261 -0
  53. bibexpy/_server/services/disambiguation/pipeline.py +985 -0
  54. bibexpy/_server/services/disambiguation/similarity.py +91 -0
  55. bibexpy/_server/services/enricher.py +338 -0
  56. bibexpy/_server/services/exporter.py +112 -0
  57. bibexpy/_server/services/filter_engine.py +300 -0
  58. bibexpy/_server/services/merger.py +202 -0
  59. bibexpy/_server/services/methodology.py +102 -0
  60. bibexpy/_server/services/report_export.py +300 -0
  61. bibexpy/_server/services/ris_writer.py +62 -0
  62. bibexpy/_server/services/smart_merger.py +967 -0
  63. bibexpy/_server/services/storage.py +165 -0
  64. bibexpy/_web/404/index.html +1 -0
  65. bibexpy/_web/404.html +1 -0
  66. bibexpy/_web/_next/static/chunks/117-7ad5186126fa076d.js +2 -0
  67. bibexpy/_web/_next/static/chunks/156-75beffb8ae92a7f3.js +1 -0
  68. bibexpy/_web/_next/static/chunks/169-301c04fcec22686f.js +4 -0
  69. bibexpy/_web/_next/static/chunks/2-ecc1c36a5632ac61.js +1 -0
  70. bibexpy/_web/_next/static/chunks/244-be343e0f8aa00fb5.js +1 -0
  71. bibexpy/_web/_next/static/chunks/367-b7457c2b95f6932b.js +1 -0
  72. bibexpy/_web/_next/static/chunks/382-9cd163eed632e241.js +1 -0
  73. bibexpy/_web/_next/static/chunks/392-afa8bfa4c1a9893d.js +1 -0
  74. bibexpy/_web/_next/static/chunks/507-63607ded9f83c1aa.js +1 -0
  75. bibexpy/_web/_next/static/chunks/624-cae565632f47217d.js +1 -0
  76. bibexpy/_web/_next/static/chunks/739-a00c6b9c0f11813e.js +1 -0
  77. bibexpy/_web/_next/static/chunks/862-8e1bd5f93c082c02.js +1 -0
  78. bibexpy/_web/_next/static/chunks/app/_not-found/page-873ba59b5eb7c342.js +1 -0
  79. bibexpy/_web/_next/static/chunks/app/layout-0d5fc68ac8971a04.js +1 -0
  80. bibexpy/_web/_next/static/chunks/app/page-cf2bea5edc74459a.js +1 -0
  81. bibexpy/_web/_next/static/chunks/app/projects/[id]/convert/page-ab6dd645ae113b64.js +1 -0
  82. bibexpy/_web/_next/static/chunks/app/projects/[id]/disambiguate/page-dd98482cc47344e9.js +1 -0
  83. bibexpy/_web/_next/static/chunks/app/projects/[id]/enrich/page-4727e431468514e9.js +1 -0
  84. bibexpy/_web/_next/static/chunks/app/projects/[id]/export/page-f9005d164bb22cb0.js +1 -0
  85. bibexpy/_web/_next/static/chunks/app/projects/[id]/layout-07755fda55b3ae1f.js +1 -0
  86. bibexpy/_web/_next/static/chunks/app/projects/[id]/merge/page-45b95318ea7beccf.js +1 -0
  87. bibexpy/_web/_next/static/chunks/app/projects/[id]/records/page-bfad80457da9eb0b.js +1 -0
  88. bibexpy/_web/_next/static/chunks/app/projects/[id]/report/page-f05767e74158e995.js +1 -0
  89. bibexpy/_web/_next/static/chunks/app/projects/[id]/upload/page-8df78ca9cdecbcb8.js +1 -0
  90. bibexpy/_web/_next/static/chunks/app/projects/page-99960711ac6a13e9.js +1 -0
  91. bibexpy/_web/_next/static/chunks/app/settings/page-002df809d6c5a227.js +1 -0
  92. bibexpy/_web/_next/static/chunks/app/tools/page-27eee1c6a590859f.js +1 -0
  93. bibexpy/_web/_next/static/chunks/fd9d1056-f2718be92181f426.js +1 -0
  94. bibexpy/_web/_next/static/chunks/framework-f66176bb897dc684.js +1 -0
  95. bibexpy/_web/_next/static/chunks/main-app-d138a6e2f8c847f6.js +1 -0
  96. bibexpy/_web/_next/static/chunks/main-c4b9ddbf514503ef.js +1 -0
  97. bibexpy/_web/_next/static/chunks/pages/_app-72b849fbd24ac258.js +1 -0
  98. bibexpy/_web/_next/static/chunks/pages/_error-7ba65e1336b92748.js +1 -0
  99. bibexpy/_web/_next/static/chunks/polyfills-42372ed130431b0a.js +1 -0
  100. bibexpy/_web/_next/static/chunks/webpack-c81f7fd28659d64f.js +1 -0
  101. bibexpy/_web/_next/static/css/1c4638f3e39cf127.css +3 -0
  102. bibexpy/_web/_next/static/ypx2IQAkZiSaFiKWnzZGB/_buildManifest.js +1 -0
  103. bibexpy/_web/_next/static/ypx2IQAkZiSaFiKWnzZGB/_ssgManifest.js +1 -0
  104. bibexpy/_web/images/PROMPTS.md +83 -0
  105. bibexpy/_web/images/about-illustration.png +0 -0
  106. bibexpy/_web/images/authors/alperen-sahin.jpg +0 -0
  107. bibexpy/_web/images/authors/burak-can-kara.jpg +0 -0
  108. bibexpy/_web/images/authors/taskin-dirsehan.jpg +0 -0
  109. bibexpy/_web/images/bibexpy-logo-full.png +0 -0
  110. bibexpy/_web/images/bibexpy-logo-header.png +0 -0
  111. bibexpy/_web/images/bibexpy-logo-in.png +0 -0
  112. bibexpy/_web/images/bibexpy-logo.webp +0 -0
  113. bibexpy/_web/images/hero-illustration.png +0 -0
  114. bibexpy/_web/images/loader.gif +0 -0
  115. bibexpy/_web/images/workflow-diagram.png +0 -0
  116. bibexpy/_web/index.html +1 -0
  117. bibexpy/_web/index.txt +11 -0
  118. bibexpy/_web/projects/_/convert/index.html +1 -0
  119. bibexpy/_web/projects/_/convert/index.txt +12 -0
  120. bibexpy/_web/projects/_/disambiguate/index.html +1 -0
  121. bibexpy/_web/projects/_/disambiguate/index.txt +12 -0
  122. bibexpy/_web/projects/_/enrich/index.html +1 -0
  123. bibexpy/_web/projects/_/enrich/index.txt +12 -0
  124. bibexpy/_web/projects/_/export/index.html +1 -0
  125. bibexpy/_web/projects/_/export/index.txt +12 -0
  126. bibexpy/_web/projects/_/merge/index.html +1 -0
  127. bibexpy/_web/projects/_/merge/index.txt +12 -0
  128. bibexpy/_web/projects/_/records/index.html +1 -0
  129. bibexpy/_web/projects/_/records/index.txt +12 -0
  130. bibexpy/_web/projects/_/report/index.html +1 -0
  131. bibexpy/_web/projects/_/report/index.txt +12 -0
  132. bibexpy/_web/projects/_/upload/index.html +1 -0
  133. bibexpy/_web/projects/_/upload/index.txt +12 -0
  134. bibexpy/_web/projects/index.html +1 -0
  135. bibexpy/_web/projects/index.txt +11 -0
  136. bibexpy/_web/settings/index.html +1 -0
  137. bibexpy/_web/settings/index.txt +11 -0
  138. bibexpy/_web/tools/README.md +65 -0
  139. bibexpy/_web/tools/bibexcel.svg +6 -0
  140. bibexpy/_web/tools/bibliometrix.png +0 -0
  141. bibexpy/_web/tools/biblioshiny.svg +12 -0
  142. bibexpy/_web/tools/citavi.svg +6 -0
  143. bibexpy/_web/tools/citespace.svg +7 -0
  144. bibexpy/_web/tools/citnetexplorer.svg +8 -0
  145. bibexpy/_web/tools/endnote.svg +6 -0
  146. bibexpy/_web/tools/excel.svg +1 -0
  147. bibexpy/_web/tools/gephi.svg +66 -0
  148. bibexpy/_web/tools/histcite.svg +7 -0
  149. bibexpy/_web/tools/index.html +1 -0
  150. bibexpy/_web/tools/index.txt +11 -0
  151. bibexpy/_web/tools/jabref.svg +122 -0
  152. bibexpy/_web/tools/latex.svg +1 -0
  153. bibexpy/_web/tools/mendeley.svg +1 -0
  154. bibexpy/_web/tools/openrefine.svg +15 -0
  155. bibexpy/_web/tools/overleaf.svg +1 -0
  156. bibexpy/_web/tools/papers.svg +7 -0
  157. bibexpy/_web/tools/powerbi.svg +1 -0
  158. bibexpy/_web/tools/python.svg +1 -0
  159. bibexpy/_web/tools/r.svg +1 -0
  160. bibexpy/_web/tools/refworks.svg +5 -0
  161. bibexpy/_web/tools/scite.svg +5 -0
  162. bibexpy/_web/tools/tableau.svg +1 -0
  163. bibexpy/_web/tools/vosviewer.png +0 -0
  164. bibexpy/_web/tools/zotero.svg +1 -0
  165. bibexpy/cli.py +242 -0
  166. bibexpy-2.0.0.dist-info/METADATA +105 -0
  167. bibexpy-2.0.0.dist-info/RECORD +171 -0
  168. bibexpy-2.0.0.dist-info/WHEEL +5 -0
  169. bibexpy-2.0.0.dist-info/entry_points.txt +2 -0
  170. bibexpy-2.0.0.dist-info/licenses/LICENSE +674 -0
  171. bibexpy-2.0.0.dist-info/top_level.txt +2 -0
bibex_core/MergeDB.py ADDED
@@ -0,0 +1,1189 @@
1
+ import pandas as pd
2
+ import re
3
+ import os
4
+ from typing import List, Union
5
+ import numpy as np
6
+ from unidecode import unidecode
7
+
8
def trim(text: str) -> str:
    """Collapse whitespace runs in *text* to single spaces and strip both ends.

    Args:
        text: Value to normalize. Non-string values are converted with ``str()``.

    Returns:
        The normalized string, or ``""`` when ``pd.isna(text)`` is true
        (``None`` / ``NaN``).
    """
    if not pd.isna(text):
        collapsed = re.sub(r'\s+', ' ', str(text))
        return collapsed.strip()
    return ""
13
+
14
def merge_values(x):
    """Return the first non-missing value of a pandas Series as a string.

    Used for combining values during database merging. Despite the name,
    nothing is concatenated: later non-missing values are ignored.

    Args:
        x (pd.Series): Candidate values for one field.

    Returns:
        str: ``str()`` of the first element for which ``pd.notna`` is true,
        or ``""`` when the Series is empty or contains only missing values.
    """
    if x.empty:
        return ""
    # Stop at the first usable value instead of stringifying the whole Series
    return next((str(val) for val in x if pd.notna(val)), "")
26
+
27
def meta_tag_extraction(df: pd.DataFrame, tag: str) -> pd.DataFrame:
    """Add an ``SR`` column of the form ``"<first author> <year>"``.

    The column is only created when both ``AU`` and ``PY`` exist; otherwise
    the frame is returned untouched. The frame is modified in place and also
    returned.

    Args:
        df: Bibliographic records. ``AU`` is expected to hold
            semicolon-separated author names and ``PY`` the publication year.
        tag: Unused; kept so existing callers keep working.

    Returns:
        The same DataFrame object, with ``SR`` added when possible.
    """
    if 'AU' not in df.columns or 'PY' not in df.columns:
        return df

    def _first_author(au) -> str:
        # Missing AU (None / NaN) falls back to an empty author
        if pd.isna(au):
            return ""
        return str(au).split(';')[0].strip()

    def _year(py) -> str:
        if pd.isna(py):
            return ""
        # A PY column containing NaN is stored as float; avoid "2020.0"
        if isinstance(py, float) and py.is_integer():
            return str(int(py))
        return str(py)

    # Built positionally rather than with df.apply(axis=1): on an empty frame
    # apply() returns a DataFrame, which cannot be assigned to a single column.
    df['SR'] = [
        f"{_first_author(au)} {_year(py)}".strip()
        for au, py in zip(df['AU'], df['PY'])
    ]
    return df
43
+
44
def clean_merged_values(x: str) -> str:
    """Normalize a semicolon-separated value list.

    Each item is stripped, empty items are dropped, and repeated items are
    removed while keeping the first occurrence's position. Non-string input
    is returned unchanged.

    Args:
        x: Semicolon-separated string (or any other value).

    Returns:
        The cleaned items joined with ``"; "``, or *x* itself when it is not
        a string.
    """
    if not isinstance(x, str):
        return x

    stripped_items = (item.strip() for item in x.split(';'))
    # dict keys keep insertion order, giving an order-preserving de-duplication
    unique_items = dict.fromkeys(item for item in stripped_items if item)
    return '; '.join(unique_items)
62
+
63
def merge_author_fields(wos_authors: str, scopus_authors: str) -> str:
    """Merge two semicolon-separated author lists, keeping WoS entries first.

    Every WoS author is kept as written. A Scopus author is appended only
    when its matching key does not occur among the WoS authors. Missing
    inputs (``None`` / ``NaN``) are treated as empty lists.

    Args:
        wos_authors (str): Author list from WoS.
        scopus_authors (str): Author list from Scopus.

    Returns:
        str: Merged author list joined with ``"; "``.
    """
    def split_authors(value):
        # Missing cells arrive as None/NaN; calling .split on them would raise
        if value is None or (not isinstance(value, str) and pd.isna(value)):
            return []
        return [a.strip() for a in str(value).split(';') if a.strip()]

    def get_author_key(author):
        # Key = letters of the first whitespace-separated token of the
        # ASCII-folded, upper-cased name.
        # NOTE(review): authors sharing that first token (e.g. the same
        # surname) are treated as the same person.
        normalized = unidecode(re.sub(r'\s+', ' ', author.strip())).upper()
        parts = normalized.split()
        if not parts:
            return ''
        return re.sub(r'[^A-Z]', '', parts[0])

    wos_list = split_authors(wos_authors)
    wos_keys = {get_author_key(author) for author in wos_list}

    # WoS authors first, then Scopus authors that WoS does not already have
    merged_authors = list(wos_list)
    merged_authors.extend(
        scopus_author
        for scopus_author in split_authors(scopus_authors)
        if get_author_key(scopus_author) not in wos_keys
    )
    return '; '.join(merged_authors)
107
+
108
def merge_author_fullnames(wos_af: str, scopus_af: str) -> str:
    """Merge author full-name lists, keeping WoS entries first.

    WoS names are kept as written. A Scopus name is appended (with any
    parenthesised ID removed and ASCII-folded) only when it has the form
    ``"LASTNAME, ..."`` and its last name does not occur among the WoS
    names. Missing inputs (``None`` / ``NaN``) are treated as empty lists.

    Args:
        wos_af (str): Author full names from WoS, semicolon-separated.
        scopus_af (str): Author full names from Scopus, semicolon-separated.

    Returns:
        str: Merged names joined with ``";"``.
    """
    def split_authors(value):
        # NaN is truthy, so a plain truthiness test would let it reach .split
        if value is None or (not isinstance(value, str) and pd.isna(value)):
            return []
        return [a.strip() for a in str(value).split(';') if a.strip()]

    def clean_author_name(author):
        # Drop "(id)" groups, collapse whitespace, fold to ASCII
        author = re.sub(r'\s*\([^)]*\)', '', author)
        author = re.sub(r'\s+', ' ', author.strip())
        return unidecode(author)

    def get_author_key(author):
        # Key is the upper-cased last name; names without a comma get no key
        parts = clean_author_name(author).split(',', 1)
        if len(parts) < 2:
            return ''
        return parts[0].strip().upper()

    wos_list = split_authors(wos_af)
    wos_keys = {get_author_key(author) for author in wos_list}

    merged_authors = list(wos_list)
    for scopus_author in split_authors(scopus_af):
        author_key = get_author_key(scopus_author)
        # Scopus names with no usable key are skipped, not appended
        if author_key and author_key not in wos_keys:
            merged_authors.append(clean_author_name(scopus_author))

    # NOTE(review): joined with ";" (no space), unlike merge_author_fields,
    # which uses "; ". Kept as-is so the output format does not change.
    return ';'.join(merged_authors)
156
+
157
def merge_address_fields(wos_addresses: str, scopus_addresses: str) -> str:
    """Choose the C1 (author addresses) value, preferring Scopus over WoS.

    Args:
        wos_addresses (str): Address information from WoS.
        scopus_addresses (str): Address information from Scopus.

    Returns:
        str: The stripped Scopus value when it is present and non-blank,
        otherwise the stripped WoS value, otherwise ``""``. Any exception
        raised while inspecting the inputs also yields ``""``.
    """
    try:
        # Candidates in priority order: Scopus first, then WoS
        for candidate in (scopus_addresses, wos_addresses):
            if pd.notna(candidate):
                text = str(candidate).strip()
                if text:
                    return text
        return ''
    except Exception:
        return ''
185
+
186
def merge_reprint_author(wos_rp: str, scopus_rp: str) -> str:
    """Choose the reprint-author value, preferring WoS over Scopus.

    The chosen value is only stripped; no other formatting is applied.

    Args:
        wos_rp (str): Reprint author from WoS.
        scopus_rp (str): Reprint author from Scopus.

    Returns:
        str: The WoS value when present and non-blank, otherwise the Scopus
        value, otherwise ``""``.
    """
    # Candidates in priority order: WoS first, then Scopus
    for candidate in (wos_rp, scopus_rp):
        if pd.notna(candidate):
            text = str(candidate).strip()
            if text:
                return text
    return ''
209
+
210
def merge_references(wos_refs: str, scopus_refs: str) -> str:
    """Merge the cited-reference lists of WoS and Scopus.

    All WoS references are kept, in their original order. A Scopus reference
    is appended only when no WoS reference shares its author+year key.
    Exact duplicate strings are emitted once. The result order is
    deterministic: WoS first, then the remaining Scopus references.

    Args:
        wos_refs (str): References from WoS, semicolon-separated.
        scopus_refs (str): References from Scopus, semicolon-separated.

    Returns:
        str: Merged references joined with ``"; "``.
    """
    def split_and_clean_refs(refs):
        if pd.isna(refs) or not refs:
            return []
        return [ref.strip() for ref in str(refs).split(';') if ref.strip()]

    def clean_text(text):
        # Keep only letters and digits
        return re.sub(r'[^A-Z0-9]', '', text.upper())

    def create_ref_key(ref):
        ref = ref.upper().strip()

        # Scopus style: the year is in parentheses at the very end
        if ref.endswith(')'):
            year_match = re.search(r'\((\d{4})\)$', ref)
            if year_match:
                # Text before the first comma is taken as the author
                return f"{clean_text(ref.split(',')[0])}_{year_match.group(1)}"

        # WoS style: "AUTHOR, YEAR, ..."
        parts = ref.split(',')
        if len(parts) >= 2:
            year_match = re.search(r'\d{4}', parts[1])
            if year_match:
                return f"{clean_text(parts[0])}_{year_match.group()}"

        # Unrecognized layout: fall back to the whole cleaned text
        return clean_text(ref)

    wos_list = split_and_clean_refs(wos_refs)
    scopus_list = split_and_clean_refs(scopus_refs)

    merged_refs = []
    emitted = set()

    # Keep every distinct WoS reference. Keying them by author+year would
    # drop different works by the same author in the same year.
    for ref in wos_list:
        if ref not in emitted:
            emitted.add(ref)
            merged_refs.append(ref)

    # Add Scopus references that WoS does not cover (WoS form is preferred)
    wos_keys = {create_ref_key(ref) for ref in wos_list}
    for ref in scopus_list:
        if ref in emitted or create_ref_key(ref) in wos_keys:
            continue
        emitted.add(ref)
        merged_refs.append(ref)

    return '; '.join(merged_refs)
279
+
280
def merge_abstracts(wos_ab: str, scopus_ab: str) -> str:
    """Pick one abstract from the WoS and Scopus versions.

    Both abstracts are cleaned first (whitespace collapsed, trailing
    "© ... reserved." notice removed). If only one is non-empty it is
    returned; if both are, the strictly longer WoS text wins, otherwise the
    Scopus text is returned.

    Args:
        wos_ab (str): Abstract from WoS.
        scopus_ab (str): Abstract from Scopus.

    Returns:
        str: The selected, cleaned abstract (``""`` when both are missing).
    """
    def clean_abstract(ab):
        if pd.isna(ab) or not ab:
            return ""
        # Collapse whitespace runs
        collapsed = re.sub(r'\s+', ' ', ab.strip())
        # Remove a copyright notice that runs to the end of the text
        without_notice = re.sub(r'©.*?RESERVED\.?$', '', collapsed, flags=re.IGNORECASE)
        return without_notice.strip()

    wos_clean = clean_abstract(wos_ab)
    scopus_clean = clean_abstract(scopus_ab)

    if wos_clean and scopus_clean:
        # Both present: prefer the longer one, Scopus on a tie
        if len(wos_clean) > len(scopus_clean):
            return wos_clean
        return scopus_clean

    # At most one is present
    return wos_clean or scopus_clean
312
+
313
def merge_keywords(wos_keywords: str, scopus_keywords: str) -> str:
    """Merge author keywords from WoS and Scopus.

    Keywords are whitespace-normalized and ASCII-folded (case is kept),
    de-duplicated case-insensitively (the first spelling wins, WoS before
    Scopus) and sorted case-insensitively.

    Args:
        wos_keywords (str): Keywords from WoS, semicolon-separated.
        scopus_keywords (str): Keywords from Scopus, semicolon-separated.

    Returns:
        str: Merged keywords joined with ``"; "``; ``""`` when both inputs
        are missing or empty.
    """
    def split_keywords(value):
        # Check for missing values BEFORE stringifying: str(NaN) is "nan",
        # which would otherwise be emitted as a keyword.
        if value is None or (not isinstance(value, str) and pd.isna(value)):
            return []
        cleaned = (
            unidecode(re.sub(r'\s+', ' ', kw.strip()))
            for kw in str(value).split(';')
        )
        return [kw for kw in cleaned if kw]

    seen = set()
    unique_keywords = []
    for kw in split_keywords(wos_keywords) + split_keywords(scopus_keywords):
        # Upper-cased form is the duplicate-detection key
        kw_upper = kw.upper()
        if kw_upper not in seen:
            seen.add(kw_upper)
            unique_keywords.append(kw)

    # Sort alphabetically (case-insensitive) for consistency
    unique_keywords.sort(key=str.upper)
    return '; '.join(unique_keywords)
354
+
355
def merge_index_keywords(wos_keywords: str, scopus_keywords: str) -> str:
    """Merge index keywords (WoS Keywords Plus and Scopus Index Keywords).

    Keywords are whitespace-normalized and ASCII-folded (case is kept),
    de-duplicated case-insensitively (the first spelling wins, WoS before
    Scopus) and sorted case-insensitively.

    Args:
        wos_keywords (str): Keywords from WoS (Keywords Plus).
        scopus_keywords (str): Keywords from Scopus (Index Keywords).

    Returns:
        str: Merged keywords joined with ``"; "``; ``""`` when both inputs
        are missing or empty.
    """
    def split_keywords(value):
        # Check for missing values BEFORE stringifying: str(NaN) is "nan",
        # which would otherwise be emitted as a keyword.
        if value is None or (not isinstance(value, str) and pd.isna(value)):
            return []
        cleaned = (
            unidecode(re.sub(r'\s+', ' ', kw.strip()))
            for kw in str(value).split(';')
        )
        return [kw for kw in cleaned if kw]

    seen = set()
    unique_keywords = []
    for kw in split_keywords(wos_keywords) + split_keywords(scopus_keywords):
        # Upper-cased form is the duplicate-detection key
        kw_upper = kw.upper()
        if kw_upper not in seen:
            seen.add(kw_upper)
            unique_keywords.append(kw)

    # Sort alphabetically (case-insensitive) for consistency
    unique_keywords.sort(key=str.upper)
    return '; '.join(unique_keywords)
396
+
397
def merge_publisher(wos_pub: str, scopus_pub: str) -> str:
    """Pick the publisher name, preferring the longer of the two sources.

    Both names are whitespace-normalized and ASCII-folded (case is kept).
    If only one is non-empty it is returned; if both are, the strictly
    longer Scopus name wins, otherwise the WoS name is returned.

    Args:
        wos_pub (str): Publisher from WoS.
        scopus_pub (str): Publisher from Scopus.

    Returns:
        str: The selected, cleaned publisher name (``""`` when both are
        missing).
    """
    def clean_publisher(pub):
        if pd.isna(pub) or not pub:
            return ""
        single_spaced = re.sub(r'\s+', ' ', pub.strip())
        return unidecode(single_spaced)

    wos_name = clean_publisher(wos_pub)
    scopus_name = clean_publisher(scopus_pub)

    if wos_name and scopus_name:
        # The longer name is usually the more complete one; WoS wins a tie
        return scopus_name if len(scopus_name) > len(wos_name) else wos_name

    # At most one source has data
    return wos_name or scopus_name
432
+
433
def merge_language(wos_lang: str, scopus_lang: str) -> str:
    """Merge and standardize the language field of WoS and Scopus.

    Each semicolon-separated language is mapped to a standard upper-case
    English name when it is a known code or native name; unknown values are
    kept ASCII-folded and upper-cased. The languages of both sources are
    combined, de-duplicated and sorted.

    Args:
        wos_lang (str): Language from WoS.
        scopus_lang (str): Language from Scopus.

    Returns:
        str: Standardized language name(s) joined with ``"; "``, or
        ``"ENGLISH"`` when neither source provides a language.
    """
    # Language code/name mapping
    LANGUAGE_MAP = {
        # Common codes
        'ENG': 'ENGLISH',
        'EN': 'ENGLISH',
        'FRE': 'FRENCH',
        'FR': 'FRENCH',
        'GER': 'GERMAN',
        'DE': 'GERMAN',
        'SPA': 'SPANISH',
        'ES': 'SPANISH',
        'ITA': 'ITALIAN',
        'IT': 'ITALIAN',
        'POR': 'PORTUGUESE',
        'PT': 'PORTUGUESE',
        'RUS': 'RUSSIAN',
        'RU': 'RUSSIAN',
        'CHI': 'CHINESE',
        'ZH': 'CHINESE',
        'JPN': 'JAPANESE',
        'JA': 'JAPANESE',
        # Full names with special characters
        'ESPANOL': 'SPANISH',
        'ESPAÑOL': 'SPANISH',
        'FRANCAIS': 'FRENCH',
        'FRANÇAIS': 'FRENCH',
        'DEUTSCHE': 'GERMAN',
        'PORTUGUES': 'PORTUGUESE',
        'PORTUGUÊS': 'PORTUGUESE',
        'ITALIANO': 'ITALIAN',
        'RUSSKIY': 'RUSSIAN',
        'РУССКИЙ': 'RUSSIAN',
        '中文': 'CHINESE',
        '日本語': 'JAPANESE'
    }

    def standardize_language(lang):
        if pd.isna(lang) or not lang:
            return ""

        standardized = set()
        for token in re.sub(r'\s+', ' ', str(lang).strip()).split(';'):
            raw = token.strip().upper()
            # Skip empty items produced by stray or trailing semicolons
            if not raw:
                continue
            # Try the untransliterated form first: the non-ASCII keys of
            # LANGUAGE_MAP can only match before ASCII folding.
            ascii_form = unidecode(raw).strip().upper()
            name = LANGUAGE_MAP.get(raw) or LANGUAGE_MAP.get(ascii_form) or ascii_form
            if name:
                standardized.add(name)

        return '; '.join(sorted(standardized))

    # Clean and standardize both inputs
    wos_std = standardize_language(wos_lang)
    scopus_std = standardize_language(scopus_lang)

    # Both sources have data: combine their languages
    if wos_std and scopus_std:
        all_langs = set(wos_std.split('; ') + scopus_std.split('; '))
        return '; '.join(sorted(all_langs))

    # One source (or none): fall back to ENGLISH when nothing is available
    return wos_std or scopus_std or "ENGLISH"
521
+
522
def merge_document_type(wos_dt: str, scopus_dt: str) -> str:
    """Merge the document-type field of WoS and Scopus.

    Each semicolon-separated type is ASCII-folded, upper-cased, stripped of
    characters other than letters, digits and spaces, and
    whitespace-normalized. The types of both sources are combined,
    de-duplicated and sorted.

    Args:
        wos_dt (str): Document type(s) from WoS.
        scopus_dt (str): Document type(s) from Scopus.

    Returns:
        str: Combined document types joined with ``"; "``; ``""`` when both
        inputs are missing or empty.
    """
    def split_doctypes(value):
        if pd.isna(value) or not value:
            return set()
        types = set()
        # Split on ';' BEFORE removing special characters; cleaning the whole
        # string first would delete the separators and fuse all types.
        for piece in str(value).split(';'):
            # Fold to ASCII before upper-casing so transliterated letters
            # are not discarded by the A-Z filter below
            piece = unidecode(piece).upper()
            piece = re.sub(r'[^A-Z0-9\s]', '', piece)
            piece = re.sub(r'\s+', ' ', piece).strip()
            if piece:
                types.add(piece)
        return types

    all_types = split_doctypes(wos_dt) | split_doctypes(scopus_dt)

    # Sort for consistency
    return '; '.join(sorted(all_types))
569
+
570
+
571
def merge_url(wos_url: str, scopus_url: str) -> str:
    """Choose the record URL, preferring WoS over Scopus.

    Args:
        wos_url (str): URL from WoS.
        scopus_url (str): URL from Scopus (Link).

    Returns:
        str: The stripped WoS URL when it is non-empty, otherwise the
        stripped Scopus URL (``""`` when that is missing too).
    """
    def clean_url(url):
        missing = pd.isna(url) or not url
        return "" if missing else str(url).strip()

    wos_clean = clean_url(wos_url)
    if wos_clean:
        return wos_clean
    return clean_url(scopus_url)
594
+
595
def merge_open_access(wos_oa: str, scopus_oa: str) -> str:
    """Merge and standardize the Open Access status of WoS and Scopus.

    Each status is whitespace-normalized, upper-cased, ASCII-folded and
    mapped to a standard label when it is a known variation. When the two
    sources disagree, a specific status wins over the generic
    ``"OPEN ACCESS"``; otherwise the WoS status is used.

    Args:
        wos_oa (str): Open Access status from WoS.
        scopus_oa (str): Open Access status from Scopus.

    Returns:
        str: Standardized Open Access status, or ``"NON OPEN ACCESS"`` when
        neither source provides one.
    """
    # OA status mapping dictionary
    OA_STATUS_MAP = {
        # Common variations
        'OPEN ACCESS': 'OPEN ACCESS',
        'OA': 'OPEN ACCESS',
        'GOLD': 'GOLD OPEN ACCESS',
        'GOLD OPEN ACCESS': 'GOLD OPEN ACCESS',
        'GREEN': 'GREEN OPEN ACCESS',
        'GREEN OPEN ACCESS': 'GREEN OPEN ACCESS',
        'BRONZE': 'BRONZE OPEN ACCESS',
        'BRONZE OPEN ACCESS': 'BRONZE OPEN ACCESS',
        'HYBRID': 'HYBRID OPEN ACCESS',
        'HYBRID OPEN ACCESS': 'HYBRID OPEN ACCESS',
        # Additional variations
        'ALL OPEN ACCESS': 'OPEN ACCESS',
        'PUBLISHED': 'OPEN ACCESS',
        'FREE': 'OPEN ACCESS',
        'PUBLISHERFULLGOLD': 'GOLD OPEN ACCESS',
        'REPOSITORY': 'GREEN OPEN ACCESS',
        # Non-OA variations
        'SUBSCRIPTION': 'NON OPEN ACCESS',
        'NON-OA': 'NON OPEN ACCESS',
        'CLOSED': 'NON OPEN ACCESS'
    }

    def standardize_oa_status(oa):
        if pd.isna(oa) or not oa:
            return ""
        # Single-space, upper-case, then fold to ASCII before the lookup
        normalized = unidecode(re.sub(r'\s+', ' ', str(oa).strip()).upper())
        return OA_STATUS_MAP.get(normalized, normalized)

    wos_status = standardize_oa_status(wos_oa)
    scopus_status = standardize_oa_status(scopus_oa)

    if wos_status and scopus_status:
        # WoS is generic but Scopus is more specific: take Scopus
        if wos_status != scopus_status and wos_status == 'OPEN ACCESS':
            return scopus_status
        # Same value, Scopus generic, or both specific: WoS wins
        return wos_status

    # At most one source has data; with none, report non-OA
    return wos_status or scopus_status or 'NON OPEN ACCESS'
664
+
665
def clean_scopus_author_fullnames(df: pd.DataFrame) -> pd.DataFrame:
    """Strip parenthesised IDs from the ``AF`` (author full names) column.

    Example: ``"CAO, NANNAN (58490132900)"`` becomes ``"CAO, NANNAN"``.
    The column is rewritten in place; frames without an ``AF`` column are
    returned unchanged.

    Args:
        df (pd.DataFrame): DataFrame containing Scopus data.

    Returns:
        pd.DataFrame: The same DataFrame object with cleaned author names.
    """
    if 'AF' not in df.columns:
        return df

    def strip_ids(name):
        # Remove "(...)" groups, then collapse leftover whitespace
        without_id = re.sub(r'\s*\([^)]*\)', '', name.strip())
        return re.sub(r'\s+', ' ', without_id.strip())

    def clean_author(author_str):
        if pd.isna(author_str) or not author_str:
            return ""
        names = (strip_ids(part) for part in author_str.split(';'))
        # Entries that end up empty are dropped
        return '; '.join(name for name in names if name)

    df['AF'] = df['AF'].apply(clean_author)
    return df
700
+
701
def merge_source_title(wos_so: str, scopus_so: str) -> str:
    """
    Pick a single source (journal) title from the WoS and Scopus candidates.

    Both candidates are whitespace-normalised first. The Scopus title wins
    whenever it is non-empty; the WoS title is used only as a fallback.

    Args:
        wos_so (str): Source title from WoS
        scopus_so (str): Source title from Scopus

    Returns:
        str: The chosen title, or an empty string when neither source has one
    """
    def normalise(raw):
        # Missing or falsy input counts as "no title"
        if pd.isna(raw) or not raw:
            return ""
        stripped = str(raw).strip()
        # Collapse every run of whitespace into a single space
        return re.sub(r'\s+', ' ', stripped)

    scopus_title = normalise(scopus_so)
    wos_title = normalise(wos_so)

    # Scopus first, then WoS; both empty falls through to ""
    for candidate in (scopus_title, wos_title):
        if candidate:
            return candidate
    return ""
733
+
734
def _non_blank(series: pd.Series) -> pd.Series:
    """Return a boolean mask of entries that are neither missing nor whitespace-only."""
    return series.notna() & (series.astype(str).str.strip() != '')


def _join_sources(values) -> str:
    """Return the distinct database names found in *values* as a sorted '; '-joined string."""
    names = set()
    for val in values:
        if pd.notna(val):
            # A value may already be a joined list (e.g. "ISI; SCOPUS") produced
            # by an earlier de-duplication pass, so split it before collecting.
            names.update(part.strip() for part in str(val).split(';') if part.strip())
    return '; '.join(sorted(names))


def _collapse_duplicates(frame: pd.DataFrame, key: str) -> pd.DataFrame:
    """Collapse rows of *frame* that share *key* into one row each, column by column."""
    def combine(column):
        # DB_Original keeps the list of contributing databases; every other
        # column is reduced by the module's merge_values helper.
        if column.name == 'DB_Original':
            return _join_sources(column)
        return merge_values(column)

    grouped = frame.groupby(key, as_index=False).agg(combine)
    # Rows assembled from more than one source database are tagged as merged
    merged_rows = grouped['DB_Original'].str.contains(';', regex=False, na=False)
    grouped.loc[merged_rows, 'DB'] = 'BIBEXPY'
    return grouped


def _title_year_keys(M: pd.DataFrame) -> pd.Series:
    """Build the title+year duplicate key, giving rows without a usable title a unique key."""
    clean_titles = M['TI'].apply(lambda x: re.sub(r'[^a-zA-Z0-9\s]', '', str(x)))
    clean_titles = clean_titles.apply(trim)
    keys = clean_titles + ' ' + M['PY'].astype(str)

    # A missing title, or one with no ASCII alphanumerics (e.g. a title written
    # entirely in a non-Latin script), cleans to the same text for every such
    # row. Those rows must not be treated as duplicates of each other, so each
    # gets a key derived from its own index label. The underscores cannot occur
    # in a cleaned title, so a placeholder never collides with a real key.
    usable = M['TI'].notna() & _non_blank(clean_titles)
    placeholders = pd.Series([f'__NO_TITLE__{label}' for label in M.index], index=M.index)
    return keys.where(usable, placeholders)


def _rows_by_source(M: pd.DataFrame, column: str, is_wos: pd.Series, is_scopus: pd.Series,
                    include_other: bool = False):
    """Yield (idx, wos_value, scopus_value) per row, placing the cell on its source's side."""
    for idx in M.index:
        if is_wos.at[idx]:
            yield idx, M.at[idx, column], ''
        elif is_scopus.at[idx]:
            yield idx, '', M.at[idx, column]
        elif include_other:
            # Row not attributable to exactly one of WoS/Scopus (already merged,
            # or from another database): hand its current value over on the WoS
            # side so the merge helper sees it.
            yield idx, M.at[idx, column], ''


def _merge_present_column(M: pd.DataFrame, column: str, merger, is_wos: pd.Series,
                          is_scopus: pd.Series) -> None:
    """Run *merger* on single-source rows whose *column* value is present and truthy."""
    for idx, wos_val, scopus_val in _rows_by_source(M, column, is_wos, is_scopus):
        if pd.isna(M.at[idx, column]):
            continue
        if wos_val or scopus_val:
            M.at[idx, column] = merger(wos_val, scopus_val)


def _merge_text_column(M: pd.DataFrame, column: str, merger, is_wos: pd.Series,
                       is_scopus: pd.Series, error_prefix: str) -> None:
    """Run *merger* on stringified single-source values, keeping the old value on failure."""
    for idx, wos_val, scopus_val in _rows_by_source(M, column, is_wos, is_scopus):
        try:
            wos_text = '' if pd.isna(wos_val) else str(wos_val)
            scopus_text = '' if pd.isna(scopus_val) else str(scopus_val)
            if wos_text or scopus_text:
                merged = merger(wos_text, scopus_text)
                if merged:
                    M.at[idx, column] = merged
        except Exception as e:
            # Best effort: report the problem and leave the row's value as it is
            print(f"{error_prefix} for index {idx}: {str(e)}")


def _merge_every_row(M: pd.DataFrame, column: str, merger, is_wos: pd.Series,
                     is_scopus: pd.Series) -> None:
    """Run *merger* on every row, including rows that are not from a single source."""
    # NOTE(review): rows that are not purely WoS or Scopus are passed as
    # (current_value, ''). This assumes the merge helpers treat that like a
    # WoS-only record; confirm against merge_language / merge_url.
    for idx, wos_val, scopus_val in _rows_by_source(M, column, is_wos, is_scopus, include_other=True):
        M.at[idx, column] = merger(wos_val, scopus_val)


def merge_db_sources(*dataframes: pd.DataFrame, remove_duplicated: bool = True, merge_fields: bool = True, verbose: bool = False) -> pd.DataFrame:
    """
    Merges bibliometric data from different databases.

    The frames are concatenated, optionally de-duplicated (first by DOI, then
    by cleaned title + publication year), and several fields are then
    normalised through the module's ``merge_*`` helpers. The input frames are
    not modified.

    Parameters:
    -----------
    *dataframes : pd.DataFrame
        Bibliographic data frames to be merged. The source of a frame is read
        from the first value of its ``DB`` column ('ISI' or 'SCOPUS').
    remove_duplicated : bool, default=True
        If True, duplicate documents are removed
    merge_fields : bool, default=True
        If True, duplicate rows are collapsed column by column with
        ``merge_values``; if False, only the first occurrence is kept
    verbose : bool, default=False
        If True, prints how many duplicate documents were removed

    Returns:
    --------
    pd.DataFrame
        Merged bibliographic data frame, as returned by
        ``meta_tag_extraction(M, 'SR')``. The ``DB_Original`` column records
        the source database(s) of each row.

    Raises:
    -------
    ValueError
        If no data frame is given
    KeyError
        If none of the data frames has a ``DB`` column
    """

    if not dataframes:
        raise ValueError("At least one data frame is required!")

    # Tag corresponding-author (RP) data by source and clean Scopus author names
    cleaned_dataframes = []
    for df in dataframes:
        # An empty frame has no first row to read the source database from
        source = df['DB'].iloc[0] if 'DB' in df.columns and len(df) > 0 else None
        if isinstance(source, str) and source in ('SCOPUS', 'ISI'):
            # Work on a copy so the caller's frame is left untouched
            df = df.copy()
            if 'RP' in df.columns:
                # Temporary per-source RP columns, resolved after de-duplication
                if source == 'SCOPUS':
                    df['RP_SCOPUS'] = df['RP']
                    df['RP_WOS'] = ''
                else:
                    df['RP_WOS'] = df['RP']
                    df['RP_SCOPUS'] = ''
            if source == 'SCOPUS':
                df = clean_scopus_author_fullnames(df)
        cleaned_dataframes.append(df)

    # Merge data frames
    M = pd.concat(cleaned_dataframes, ignore_index=True)
    initial_size = len(M)

    if 'DB' not in M.columns:
        raise KeyError("None of the data frames has a 'DB' column")

    # Create DB_Original column to track original source databases
    M['DB_Original'] = M['DB']

    if remove_duplicated:
        if merge_fields:
            # Collapse records sharing a DOI. Blank DOIs count as "no DOI";
            # otherwise every record with an empty DOI string would be merged
            # into a single row.
            if 'DI' in M.columns:
                has_doi = _non_blank(M['DI'])
                if has_doi.any():
                    grouped = _collapse_duplicates(M[has_doi], 'DI')
                    M = pd.concat([grouped, M[~has_doi]], ignore_index=True)

            # Collapse records sharing cleaned title and year
            if 'TI' in M.columns and 'PY' in M.columns:
                M['title_year'] = _title_year_keys(M)
                if len(M) > 0:
                    M = _collapse_duplicates(M, 'title_year')
                M = M.drop(columns='title_year')
        else:
            # Just remove duplicate records, keeping the first occurrence
            if 'DI' in M.columns:
                duplicates = M['DI'].duplicated() & _non_blank(M['DI'])
                M = M[~duplicates].copy()

            if 'TI' in M.columns and 'PY' in M.columns:
                duplicates = _title_year_keys(M).duplicated()
                M = M[~duplicates].copy()

        if verbose:
            print(f"{initial_size - len(M)} duplicated documents have been removed")

    # If there are multiple databases, label the whole collection as ISI.
    # NOTE(review): this also replaces the 'BIBEXPY' marker set during
    # de-duplication; DB_Original still records where each row came from.
    if len(M['DB'].unique()) > 1:
        M['DB'] = 'ISI'

    # Complete WC and SC fields from each other
    if 'WC' in M.columns and 'SC' in M.columns:
        # Fill WC from SC if WC is empty
        wc_fillable = ~_non_blank(M['WC']) & M['SC'].notna()
        M['WC'] = M['WC'].where(~wc_fillable, M['SC'])
        # Fill SC from (the possibly just-filled) WC if SC is empty
        sc_fillable = ~_non_blank(M['SC']) & M['WC'].notna()
        M['SC'] = M['SC'].where(~sc_fillable, M['WC'])

    # Merge RP data using temporary columns: WoS first, then Scopus
    if 'RP_WOS' in M.columns and 'RP_SCOPUS' in M.columns:
        # Rows from other databases have no temporary RP values; fall back to
        # their existing RP instead of blanking it.
        fallback = M['RP'].where(_non_blank(M['RP']), '')
        from_scopus = M['RP_SCOPUS'].where(_non_blank(M['RP_SCOPUS']), fallback)
        M['RP'] = M['RP_WOS'].where(_non_blank(M['RP_WOS']), from_scopus)
        # Drop temporary columns
        M = M.drop(['RP_WOS', 'RP_SCOPUS'], axis=1)

    # Rows that come from exactly one of the two supported databases
    is_wos = M['DB_Original'] == 'ISI'
    is_scopus = M['DB_Original'] == 'SCOPUS'
    # Field merging only runs when both databases contribute single-source rows
    both_sources = bool(is_wos.any() and is_scopus.any())

    # NOTE: there are deliberately no per-row passes for AU, AF, JI and UT.
    # A row belongs to at most one source, so there is never both a WoS and a
    # Scopus author value to reconcile, and a JI/UT pass could only re-assign a
    # row's own value. Records found in both databases are reduced by
    # merge_values during de-duplication instead.

    # Use Scopus source title when available, otherwise use WoS
    if both_sources and 'SO' in M.columns:
        _merge_text_column(M, 'SO', merge_source_title, is_wos, is_scopus,
                           "Warning: Error merging source titles")

    # Trim addresses of single-source rows
    if 'C1' in M.columns:
        for idx in M.index[(is_wos | is_scopus).to_numpy()]:
            try:
                current_address = M.at[idx, 'C1']
                if pd.notna(current_address) and str(current_address).strip():
                    M.at[idx, 'C1'] = str(current_address).strip()
            except (TypeError, ValueError):
                # Best effort: leave values that cannot be cleaned as they are
                continue

    if both_sources:
        # Abstracts, author keywords and index keywords
        for column, merger in (('AB', merge_abstracts),
                               ('DE', merge_keywords),
                               ('ID', merge_index_keywords)):
            if column in M.columns:
                _merge_present_column(M, column, merger, is_wos, is_scopus)

        # References
        if 'CR' in M.columns:
            _merge_text_column(M, 'CR', merge_references, is_wos, is_scopus,
                               "Error merging references")

        # Publisher names
        if 'PU' in M.columns:
            _merge_present_column(M, 'PU', merge_publisher, is_wos, is_scopus)

        # Language information
        if 'LA' in M.columns:
            _merge_every_row(M, 'LA', merge_language, is_wos, is_scopus)

        # Document types
        if 'DT' in M.columns:
            _merge_text_column(M, 'DT', merge_document_type, is_wos, is_scopus,
                               "Warning: Error merging document types")

        # URLs
        if 'URL' in M.columns:
            _merge_every_row(M, 'URL', merge_url, is_wos, is_scopus)

        # Open Access status
        if 'OA' in M.columns:
            _merge_every_row(M, 'OA', merge_open_access, is_wos, is_scopus)

    # Create SR tag
    M = meta_tag_extraction(M, 'SR')

    return M
1136
+
1137
def main():
    """
    Command-line entry point: merge every workbook in ``rawData`` into one file.

    Reads all ``.xlsx`` files found in the ``rawData`` folder of the current
    working directory, merges them with ``merge_db_sources`` and writes the
    result to ``merged_data.xlsx``. Progress and errors are printed; nothing
    is returned and no exception escapes.
    """
    try:
        print("Database Merge Tool")
        print("------------------")

        # Find Excel files in rawData folder
        raw_data_path = "rawData"
        if not os.path.isdir(raw_data_path):
            print("Error: rawData folder not found!")
            return

        # Sorted so the merge order does not depend on the file system.
        # Names starting with "~$" are lock files Excel creates next to an
        # open workbook; they are not readable workbooks.
        excel_files = sorted(
            f for f in os.listdir(raw_data_path)
            if f.lower().endswith('.xlsx') and not f.startswith('~$')
        )

        if not excel_files:
            print("Error: No Excel files found in rawData folder!")
            return

        print("\nFound Excel files:")
        for i, file in enumerate(excel_files, 1):
            print(f"{i}. {file}")

        dataframes = []
        for file in excel_files:
            try:
                file_path = os.path.join(raw_data_path, file)
                df = pd.read_excel(file_path)
                dataframes.append(df)
                print(f"\n{file} loaded successfully.")
                print(f"Record count: {len(df)}")
            except Exception as e:
                # An unreadable workbook is reported and skipped, not fatal
                print(f"Error: Could not read file {file}: {str(e)}")

        if not dataframes:
            print("\nNo files could be read!")
            return

        print("\nMerging data...")
        merged_df = merge_db_sources(*dataframes, remove_duplicated=True, merge_fields=True, verbose=False)

        # Save merged file
        output_file = "merged_data.xlsx"
        merged_df.to_excel(output_file, index=True)  # SR will be used as index

        print(f"\nMerged data saved to {output_file}")
        print(f"Total record count: {len(merged_df)}")

    except Exception as e:
        # Top-level boundary of the command-line tool: report and exit cleanly
        print(f"\nAn unexpected error occurred: {str(e)}")
    finally:
        print("\nProgram terminating...")
1187
+
1188
# Run the merge tool only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()