scitex 2.17.3__py3-none-any.whl → 2.17.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- scitex/_dev/_dashboard/_routes.py +13 -0
- scitex/_dev/_dashboard/_scripts.py +144 -23
- scitex/_dev/_dashboard/_styles.py +90 -0
- scitex/_dev/_dashboard/_templates.py +14 -1
- scitex/_dev/_rtd.py +122 -0
- scitex/_dev/_ssh.py +38 -8
- scitex/dev/plt/data/mpl/PLOTTING_FUNCTIONS.yaml +90 -0
- scitex/dev/plt/data/mpl/PLOTTING_SIGNATURES.yaml +1571 -0
- scitex/dev/plt/data/mpl/PLOTTING_SIGNATURES_DETAILED.yaml +6262 -0
- scitex/dev/plt/data/mpl/SIGNATURES_FLATTENED.yaml +1274 -0
- scitex/dev/plt/data/mpl/dir_ax.txt +459 -0
- scitex/scholar/_mcp/crossref_handlers.py +45 -7
- scitex/scholar/_mcp/openalex_handlers.py +45 -7
- scitex/scholar/config/default.yaml +2 -0
- scitex/scholar/data/.gitkeep +0 -0
- scitex/scholar/data/README.md +44 -0
- scitex/scholar/data/bib_files/bibliography.bib +1952 -0
- scitex/scholar/data/bib_files/neurovista.bib +277 -0
- scitex/scholar/data/bib_files/neurovista_enriched.bib +441 -0
- scitex/scholar/data/bib_files/neurovista_enriched_enriched.bib +441 -0
- scitex/scholar/data/bib_files/neurovista_processed.bib +338 -0
- scitex/scholar/data/bib_files/openaccess.bib +89 -0
- scitex/scholar/data/bib_files/pac-seizure_prediction_enriched.bib +2178 -0
- scitex/scholar/data/bib_files/pac.bib +698 -0
- scitex/scholar/data/bib_files/pac_enriched.bib +1061 -0
- scitex/scholar/data/bib_files/pac_processed.bib +0 -0
- scitex/scholar/data/bib_files/pac_titles.txt +75 -0
- scitex/scholar/data/bib_files/paywalled.bib +98 -0
- scitex/scholar/data/bib_files/related-papers-by-coauthors.bib +58 -0
- scitex/scholar/data/bib_files/related-papers-by-coauthors_enriched.bib +87 -0
- scitex/scholar/data/bib_files/seizure_prediction.bib +694 -0
- scitex/scholar/data/bib_files/seizure_prediction_processed.bib +0 -0
- scitex/scholar/data/bib_files/test_complete_enriched.bib +437 -0
- scitex/scholar/data/bib_files/test_final_enriched.bib +437 -0
- scitex/scholar/data/bib_files/test_seizure.bib +46 -0
- scitex/scholar/data/impact_factor/JCR_IF_2022.xlsx +0 -0
- scitex/scholar/data/impact_factor/JCR_IF_2024.db +0 -0
- scitex/scholar/data/impact_factor/JCR_IF_2024.xlsx +0 -0
- scitex/scholar/data/impact_factor/JCR_IF_2024_v01.db +0 -0
- scitex/scholar/data/impact_factor.db +0 -0
- scitex/scholar/local_dbs/__init__.py +5 -1
- scitex/scholar/local_dbs/export.py +93 -0
- scitex/scholar/local_dbs/unified.py +505 -0
- scitex/scholar/metadata_engines/ScholarEngine.py +11 -0
- scitex/scholar/metadata_engines/individual/OpenAlexLocalEngine.py +346 -0
- scitex/scholar/metadata_engines/individual/__init__.py +1 -0
- {scitex-2.17.3.dist-info → scitex-2.17.4.dist-info}/METADATA +1 -1
- {scitex-2.17.3.dist-info → scitex-2.17.4.dist-info}/RECORD +51 -22
- scitex/scholar/url_finder/.tmp/open_url/KNOWN_RESOLVERS.py +0 -462
- scitex/scholar/url_finder/.tmp/open_url/README.md +0 -223
- scitex/scholar/url_finder/.tmp/open_url/_DOIToURLResolver.py +0 -694
- scitex/scholar/url_finder/.tmp/open_url/_OpenURLResolver.py +0 -1160
- scitex/scholar/url_finder/.tmp/open_url/_ResolverLinkFinder.py +0 -344
- scitex/scholar/url_finder/.tmp/open_url/__init__.py +0 -24
- {scitex-2.17.3.dist-info → scitex-2.17.4.dist-info}/WHEEL +0 -0
- {scitex-2.17.3.dist-info → scitex-2.17.4.dist-info}/entry_points.txt +0 -0
- {scitex-2.17.3.dist-info → scitex-2.17.4.dist-info}/licenses/LICENSE +0 -0
|
@@ -1,344 +0,0 @@
|
|
|
1
|
-
#!/usr/bin/env python3
|
|
2
|
-
# -*- coding: utf-8 -*-
|
|
3
|
-
# Timestamp: "2025-07-29 03:10:08 (ywatanabe)"
|
|
4
|
-
# File: /home/ywatanabe/proj/scitex_repo/src/scitex/scholar/open_url/_ResolverLinkFinder.py
|
|
5
|
-
# ----------------------------------------
|
|
6
|
-
from __future__ import annotations
|
|
7
|
-
|
|
8
|
-
import os
|
|
9
|
-
|
|
10
|
-
__FILE__ = (
|
|
11
|
-
"./src/scitex/scholar/open_url/_ResolverLinkFinder.py"
|
|
12
|
-
)
|
|
13
|
-
__DIR__ = os.path.dirname(__FILE__)
|
|
14
|
-
# ----------------------------------------
|
|
15
|
-
|
|
16
|
-
"""Robust resolver link finder using a prioritized, multi-layered approach.
|
|
17
|
-
|
|
18
|
-
Priority order:
|
|
19
|
-
1. Link Target (domain matching) - Most reliable
|
|
20
|
-
2. Page Structure (CSS selectors) - Very reliable
|
|
21
|
-
3. Text Patterns - Good fallback
|
|
22
|
-
"""
|
|
23
|
-
|
|
24
|
-
import re
|
|
25
|
-
from typing import List, Optional
|
|
26
|
-
from urllib.parse import urlparse
|
|
27
|
-
|
|
28
|
-
from playwright.async_api import ElementHandle, Page
|
|
29
|
-
|
|
30
|
-
from scitex import logging
|
|
31
|
-
|
|
32
|
-
logger = logging.getLogger(__name__)
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
class ResolverLinkFinder:
    """Find full-text links on library link-resolver pages.

    ``find_link_async`` tries the strategies below in priority order and
    returns the first hit:

    1. Link target - an anchor whose host contains a publisher domain that
       is expected for the DOI prefix.
    2. Page structure - anchors matched by known resolver CSS selectors,
       ranked by their link text.

    A text-pattern strategy (``_find_by_text_async``) is also defined, but
    ``find_link_async`` does not call it.
    NOTE(review): confirm whether leaving it out is intentional; its most
    generic patterns ("View", "PDF", "Download") match very broadly.
    """

    # DOI prefix to publisher domain mapping
    DOI_TO_DOMAIN = {
        "10.1038": [
            "nature.com",
            "springernature.com",
        ],  # Nature Publishing Group
        "10.1016": ["sciencedirect.com", "elsevier.com"],  # Elsevier
        "10.1002": ["wiley.com", "onlinelibrary.wiley.com"],  # Wiley
        "10.1007": ["springer.com", "link.springer.com"],  # Springer
        "10.1126": ["science.org", "sciencemag.org"],  # Science/AAAS
        "10.1021": ["acs.org", "pubs.acs.org"],  # ACS Publications
        "10.1111": [
            "wiley.com",
            "onlinelibrary.wiley.com",
        ],  # Wiley (alternative)
        "10.1080": ["tandfonline.com"],  # Taylor & Francis
        "10.1177": ["sagepub.com", "journals.sagepub.com"],  # SAGE
        "10.1093": ["oup.com", "academic.oup.com"],  # Oxford
        "10.1109": ["ieee.org", "ieeexplore.ieee.org"],  # IEEE
        "10.1371": ["plos.org", "journals.plos.org"],  # PLOS
        "10.1073": ["pnas.org"],  # PNAS
        "10.1136": ["bmj.com"],  # BMJ
        "10.3389": ["frontiersin.org"],  # Frontiers
        "10.3390": ["mdpi.com"],  # MDPI
    }

    # Common resolver page structures
    STRUCTURE_SELECTORS = [
        # SFX (ExLibris) - used by many universities
        "div#fulltext a",
        "div.sfx-fulltext a",
        "div.results-title > a",
        "td.object-cell a",
        ".getFullTxt a",
        'div[id*="fulltext"] a',
        'div[class*="fulltext"] a',
        # SFX specific selectors for University of Melbourne
        "a[title*='Wiley Online Library']",
        "a[href*='wiley.com']",
        "a[href*='onlinelibrary.wiley.com']",
        ".sfx-target a",
        ".target a",
        "td a[href*='wiley']",
        # Primo (ExLibris)
        "prm-full-view-service-container a",
        "span.availability-status-available a",
        # Summon (ProQuest)
        ".summon-fulltext-link",
        "a.summon-link",
        # EDS (EBSCO)
        "a.fulltext-link",
        ".ft-link a",
        # Generic patterns
        "a.full-text-link",
        "a.fulltext",
        "a#full-text-link",
        ".access-link a",
        ".available-link a",
    ]

    # Text patterns in priority order
    TEXT_PATTERNS = [
        # Most specific
        "View full text at",
        "Available from Nature",
        "Available from ScienceDirect",
        "Available from Wiley",
        "Available from Wiley Online Library",
        "Full text available from",
        # Common patterns
        "View full text",
        "Full Text from Publisher",
        "Get full text",
        "Access full text",
        "Go to article",
        "Access article",
        # Generic but reliable
        "Full Text",
        "Full text",
        "Article",
        "View",
        "PDF",
        "Download",
    ]

    # Link texts containing any of these are skipped by the domain strategy.
    _ABSTRACT_MARKERS = ("abstract", "preview", "summary")

    # Link texts containing any of these are down-ranked by the structure
    # strategy in favour of direct publisher links.
    _AGGREGATOR_KEYWORDS = ("gale", "proquest", "ebsco", "jstor", "onefile")

    # A DOI registrant prefix ("10.NNNN") anywhere in the input, so that
    # "doi:10.1038/..." and "https://doi.org/10.1038/..." are accepted too.
    # The look-behind keeps the match from starting inside a longer number.
    _DOI_PREFIX_RE = re.compile(r"(?<![\d.])(10\.\d{4,})")

    def __init__(self):
        self._doi_pattern = re.compile(r"10\.\d{4,}/[-._;()/:\w]+")

    def get_expected_domains(self, doi: str) -> List[str]:
        """Return the publisher domains registered for the DOI's prefix.

        Args:
            doi: A DOI, optionally carrying a ``doi:`` or resolver-URL
                prefix (e.g. ``https://doi.org/10.1038/...``).

        Returns:
            A new list of domains from ``DOI_TO_DOMAIN``; empty when ``doi``
            is empty/None, has no ``10.NNNN`` prefix, or the prefix is not
            in the mapping.
        """
        if not doi:
            return []

        match = self._DOI_PREFIX_RE.search(doi)
        if not match:
            return []

        # Copy so callers cannot mutate the class-level mapping.
        return list(self.DOI_TO_DOMAIN.get(match.group(1), []))

    @staticmethod
    def _registrable_label(domain: str) -> str:
        """Return the label left of the TLD, e.g. ``"oup"`` for ``academic.oup.com``."""
        labels = domain.split(".")
        return labels[-2] if len(labels) >= 2 else labels[0]

    def _score_link_text(self, text: str, publisher_keywords) -> int:
        """Rank a lower-cased link text: 3 publisher, 2 generic, -1 aggregator, 0 other."""
        if any(keyword in text for keyword in publisher_keywords):
            return 3
        if "publisher" in text:
            return 2
        if any(keyword in text for keyword in self._AGGREGATOR_KEYWORDS):
            return -1
        return 0

    async def find_link_async(self, page, doi: str) -> dict:
        """Find the best full-text link using prioritized strategies.

        Args:
            page: Playwright page showing the resolver result.
            doi: Target DOI.

        Returns:
            ``{"success": bool, "url": str | None, "method": str | None}``
            where ``method`` is ``"domain"`` or ``"structure"`` on success.
            ``url`` is the raw ``href`` attribute of the chosen anchor.
        """
        logger.info(f"Finding resolver link for DOI: {doi}")

        # Strategy 1: Link Target (Most Reliable)
        link_url = await self._find_by_domain_async(page, doi)
        if link_url:
            logger.info("✓ Found link using domain matching (Strategy 1)")
            return {"success": True, "url": link_url, "method": "domain"}

        # Strategy 2: Page Structure with scoring
        link_url = await self._find_by_structure_async(page, doi)
        if link_url:
            logger.info("✓ Found link using page structure (Strategy 2)")
            return {"success": True, "url": link_url, "method": "structure"}

        logger.warning("✗ No suitable links found")
        return {"success": False, "url": None, "method": None}

    async def _find_by_domain_async(self, page: Page, doi: str) -> Optional[str]:
        """Strategy 1: Find link by expected publisher domain.

        Returns the ``href`` of the first anchor whose host contains an
        expected publisher domain and whose text does not look like an
        abstract/preview link, or ``None`` when there is no such anchor.
        """
        expected_domains = self.get_expected_domains(doi)
        if not expected_domains:
            logger.debug(f"No known publisher domains for DOI prefix: {doi}")
            return None

        logger.debug(f"Looking for links to domains: {expected_domains}")
        all_links = await page.query_selector_all("a[href]")

        for link in all_links:
            href = await link.get_attribute("href")
            if not href:
                continue

            try:
                domain = urlparse(href).netloc.lower()

                for expected in expected_domains:
                    # Substring (not suffix) match is kept on purpose: it
                    # also accepts proxy-rewritten hosts of the form
                    # "<publisher-domain>.<proxy-host>". The trade-off is
                    # that look-alike hosts are accepted as well.
                    if expected not in domain:
                        continue

                    text = await link.inner_text() or ""
                    logger.info(
                        f"Found domain match: {domain} (text: '{text[:50]}')"
                    )

                    lowered = text.lower()
                    if not any(bad in lowered for bad in self._ABSTRACT_MARKERS):
                        return href
                    logger.debug(f"Skipping abstract/preview link: {text}")
            except Exception as e:
                # Best effort: a broken anchor must not abort the scan.
                logger.debug(f"Error parsing URL {href}: {e}")

        return None

    async def _find_by_structure_async(self, page, doi: str):
        """Strategy 2: Find link by page structure with publisher prioritization.

        Collects visible anchors matched by ``STRUCTURE_SELECTORS``, scores
        each by its link text and returns the ``href`` of the best one
        (the earliest-found anchor wins ties), or ``None`` if none matched.
        """
        potential_links = []
        # Use the registrable label ("oup", "springer", "wiley") rather than
        # the first host label: "academic", "link", "journals" or "pubs" are
        # generic words that would rank aggregator links such as
        # "Academic OneFile" as direct publisher matches.
        publisher_keywords = {
            self._registrable_label(domain)
            for domain in self.get_expected_domains(doi)
        }

        # Gather all possible links
        for selector in self.STRUCTURE_SELECTORS:
            try:
                elements = await page.query_selector_all(selector)
                logger.debug(
                    f"Found {len(elements)} elements with selector: {selector}"
                )

                for element in elements:
                    if not await element.is_visible():
                        continue
                    href = await element.get_attribute("href")
                    text = (await element.inner_text() or "").lower()

                    if href and href.strip():
                        potential_links.append(
                            {"href": href, "text": text, "score": 0}
                        )
            except Exception as element_error:
                # Best effort: one failing selector must not stop the others.
                logger.debug(
                    f"Error with selector '{selector}': {element_error}"
                )

        if not potential_links:
            return None

        # Score the links
        for link in potential_links:
            link["score"] = self._score_link_text(
                link["text"], publisher_keywords
            )

        # max() returns the first maximal element, i.e. the earliest-found
        # link among those sharing the highest score.
        best_link = max(potential_links, key=lambda link: link["score"])

        logger.debug(
            f"Found structural match: '{best_link['text'][:50]}' -> {best_link['href']}"
        )
        return best_link["href"]

    async def _find_by_text_async(self, page: Page) -> Optional[str]:
        """Strategy 3: Find link by text patterns.

        Tries ``TEXT_PATTERNS`` in order and returns the ``href`` of the
        first visible anchor containing the pattern, or ``None``.
        """
        for pattern in self.TEXT_PATTERNS:
            try:
                selector = f'a:has-text("{pattern}")'
                link = await page.query_selector(selector)
                if link and await link.is_visible():
                    href = await link.get_attribute("href")
                    if href and href.strip():
                        logger.debug(
                            f"Found text match: '{pattern}' -> {href[:100]}"
                        )
                        return href
            except Exception as e:
                logger.debug(f"Error with text pattern '{pattern}': {e}")

        return None

    async def click_and_wait_async(self, page: Page, link: ElementHandle) -> bool:
        """Click link and wait for navigation.

        Args:
            page: Playwright page that owns ``link``.
            link: Anchor element to click.

        Returns:
            True if the page URL changed after the click; False if it did
            not change or an error occurred while clicking/waiting.
        """
        initial_url = page.url

        try:
            # Get link info for logging
            href = await link.get_attribute("href") or ""
            text = await link.inner_text() or ""
            logger.info(f"Clicking link: '{text[:50]}' -> {href[:100]}")

            # Click and wait for navigation
            await link.click()

            # Wait for either navigation or network idle
            try:
                await page.wait_for_load_state("networkidle", timeout=30000)
            except Exception:
                # Fallback to domcontentloaded if network doesn't settle.
                # Only Exception is caught so that task cancellation and
                # KeyboardInterrupt still propagate.
                await page.wait_for_load_state(
                    "domcontentloaded", timeout=30000
                )

            # Additional wait for JavaScript redirects
            await page.wait_for_timeout(3000)

            # Check if we navigated
            final_url = page.url
            if final_url != initial_url:
                logger.info(
                    f"Successfully navigated: {initial_url} -> {final_url}"
                )
                return True

            logger.warning("No navigation occurred after click")
            return False

        except Exception as e:
            logger.error(f"Error during click and navigation: {e}")
            return False
|
|
317
|
-
|
|
318
|
-
|
|
319
|
-
# Convenience function for integration
|
|
320
|
-
async def find_and_click_resolver_link_async(page: Page, doi: str) -> Optional[str]:
    """Find and click the best resolver link.

    Args:
        page: Playwright page object
        doi: Target DOI

    Returns:
        Final URL after navigation, or None if no link was found, the
        anchor could not be located again, or the click did not navigate.
    """
    finder = ResolverLinkFinder()

    # find_link_async returns a result dict (never a falsy value), so the
    # outcome has to be read from its "success"/"url" entries.
    result = await finder.find_link_async(page, doi)
    target_href = result.get("url") if result.get("success") else None
    if not target_href:
        return None

    # click_and_wait_async needs an element handle, but the finder reports
    # only the raw href attribute: locate the anchor carrying that href,
    # preferring a visible one.
    link = None
    hidden_match = None
    for candidate in await page.query_selector_all("a[href]"):
        if await candidate.get_attribute("href") != target_href:
            continue
        if await candidate.is_visible():
            link = candidate
            break
        if hidden_match is None:
            hidden_match = candidate
    if link is None:
        link = hidden_match

    if link is None:
        logger.warning(f"Resolver link no longer present on page: {target_href}")
        return None

    # Click and navigate
    success = await finder.click_and_wait_async(page, link)
    return page.url if success else None
|
|
343
|
-
|
|
344
|
-
# EOF
|
|
@@ -1,24 +0,0 @@
|
|
|
1
|
-
#!/usr/bin/env python3
|
|
2
|
-
# -*- coding: utf-8 -*-
|
|
3
|
-
# Timestamp: "2025-07-31 00:53:24 (ywatanabe)"
|
|
4
|
-
# File: /home/ywatanabe/proj/scitex_repo/src/scitex/scholar/open_url/__init__.py
|
|
5
|
-
# ----------------------------------------
|
|
6
|
-
from __future__ import annotations
|
|
7
|
-
|
|
8
|
-
import os
|
|
9
|
-
|
|
10
|
-
__FILE__ = (
|
|
11
|
-
"./src/scitex/scholar/open_url/__init__.py"
|
|
12
|
-
)
|
|
13
|
-
__DIR__ = os.path.dirname(__FILE__)
|
|
14
|
-
# ----------------------------------------
|
|
15
|
-
|
|
16
|
-
from ._DOIToURLResolver import DOIToURLResolver
|
|
17
|
-
from ._OpenURLResolver import OpenURLResolver
|
|
18
|
-
|
|
19
|
-
__all__ = [
|
|
20
|
-
"OpenURLResolver",
|
|
21
|
-
"DOIToURLResolver",
|
|
22
|
-
]
|
|
23
|
-
|
|
24
|
-
# EOF
|
|
File without changes
|
|
File without changes
|
|
File without changes
|