scitex 2.15.2-py3-none-any.whl → 2.15.4-py3-none-any.whl

This diff compares the contents of two publicly released versions of the package as published to a supported registry. It is provided for informational purposes only and reflects the changes between the package versions exactly as they appear in the public registry.
Files changed (77)
  1. scitex/_mcp_resources/__init__.py +2 -0
  2. scitex/_mcp_resources/_scholar.py +148 -0
  3. scitex/_mcp_tools/scholar.py +50 -99
  4. scitex/_mcp_tools/social.py +15 -232
  5. scitex/_mcp_tools/writer.py +7 -17
  6. scitex/canvas/mcp_server.py +16 -3
  7. scitex/capture/mcp_server.py +16 -2
  8. scitex/cli/audio.py +90 -20
  9. scitex/cli/capture.py +120 -0
  10. scitex/cli/introspect.py +19 -12
  11. scitex/cli/plt.py +78 -21
  12. scitex/cli/scholar/__init__.py +160 -2
  13. scitex/cli/scholar/_crossref_scitex.py +25 -266
  14. scitex/cli/scholar/_openalex_scitex.py +55 -0
  15. scitex/cli/social.py +63 -22
  16. scitex/cli/stats.py +121 -3
  17. scitex/cli/writer.py +49 -423
  18. scitex/dev/plt/data/mpl/PLOTTING_FUNCTIONS.yaml +90 -0
  19. scitex/dev/plt/data/mpl/PLOTTING_SIGNATURES.yaml +1571 -0
  20. scitex/dev/plt/data/mpl/PLOTTING_SIGNATURES_DETAILED.yaml +6262 -0
  21. scitex/dev/plt/data/mpl/SIGNATURES_FLATTENED.yaml +1274 -0
  22. scitex/dev/plt/data/mpl/dir_ax.txt +459 -0
  23. scitex/introspect/_list_api.py +5 -2
  24. scitex/plt/docs/EXTERNAL_PACKAGE_BRANDING.md +2 -2
  25. scitex/scholar/__init__.py +14 -9
  26. scitex/scholar/_mcp/crossref_tool_schemas.py +133 -0
  27. scitex/scholar/_mcp/openalex_handlers.py +212 -0
  28. scitex/scholar/_mcp/openalex_tool_schemas.py +96 -0
  29. scitex/scholar/_mcp/tool_schemas.py +16 -1
  30. scitex/scholar/data/.gitkeep +0 -0
  31. scitex/scholar/data/README.md +44 -0
  32. scitex/scholar/data/bib_files/bibliography.bib +1952 -0
  33. scitex/scholar/data/bib_files/neurovista.bib +277 -0
  34. scitex/scholar/data/bib_files/neurovista_enriched.bib +441 -0
  35. scitex/scholar/data/bib_files/neurovista_enriched_enriched.bib +441 -0
  36. scitex/scholar/data/bib_files/neurovista_processed.bib +338 -0
  37. scitex/scholar/data/bib_files/openaccess.bib +89 -0
  38. scitex/scholar/data/bib_files/pac-seizure_prediction_enriched.bib +2178 -0
  39. scitex/scholar/data/bib_files/pac.bib +698 -0
  40. scitex/scholar/data/bib_files/pac_enriched.bib +1061 -0
  41. scitex/scholar/data/bib_files/pac_processed.bib +0 -0
  42. scitex/scholar/data/bib_files/pac_titles.txt +75 -0
  43. scitex/scholar/data/bib_files/paywalled.bib +98 -0
  44. scitex/scholar/data/bib_files/related-papers-by-coauthors.bib +58 -0
  45. scitex/scholar/data/bib_files/related-papers-by-coauthors_enriched.bib +87 -0
  46. scitex/scholar/data/bib_files/seizure_prediction.bib +694 -0
  47. scitex/scholar/data/bib_files/seizure_prediction_processed.bib +0 -0
  48. scitex/scholar/data/bib_files/test_complete_enriched.bib +437 -0
  49. scitex/scholar/data/bib_files/test_final_enriched.bib +437 -0
  50. scitex/scholar/data/bib_files/test_seizure.bib +46 -0
  51. scitex/scholar/data/impact_factor/JCR_IF_2022.xlsx +0 -0
  52. scitex/scholar/data/impact_factor/JCR_IF_2024.db +0 -0
  53. scitex/scholar/data/impact_factor/JCR_IF_2024.xlsx +0 -0
  54. scitex/scholar/data/impact_factor/JCR_IF_2024_v01.db +0 -0
  55. scitex/scholar/data/impact_factor.db +0 -0
  56. scitex/scholar/docs/EXTERNAL_PACKAGE_BRANDING.md +2 -2
  57. scitex/scholar/local_dbs/__init__.py +31 -0
  58. scitex/scholar/local_dbs/crossref_scitex.py +30 -0
  59. scitex/scholar/local_dbs/openalex_scitex.py +30 -0
  60. scitex/scholar/mcp_server.py +59 -4
  61. scitex/social/docs/EXTERNAL_PACKAGE_BRANDING.md +2 -2
  62. scitex/stats/mcp_server.py +16 -3
  63. scitex/template/mcp_server.py +16 -3
  64. scitex/ui/mcp_server.py +16 -3
  65. scitex/writer/__init__.py +43 -34
  66. {scitex-2.15.2.dist-info → scitex-2.15.4.dist-info}/METADATA +22 -3
  67. {scitex-2.15.2.dist-info → scitex-2.15.4.dist-info}/RECORD +70 -38
  68. scitex/scholar/crossref_scitex.py +0 -367
  69. scitex/scholar/url_finder/.tmp/open_url/KNOWN_RESOLVERS.py +0 -462
  70. scitex/scholar/url_finder/.tmp/open_url/README.md +0 -223
  71. scitex/scholar/url_finder/.tmp/open_url/_DOIToURLResolver.py +0 -694
  72. scitex/scholar/url_finder/.tmp/open_url/_OpenURLResolver.py +0 -1160
  73. scitex/scholar/url_finder/.tmp/open_url/_ResolverLinkFinder.py +0 -344
  74. scitex/scholar/url_finder/.tmp/open_url/__init__.py +0 -24
  75. {scitex-2.15.2.dist-info → scitex-2.15.4.dist-info}/WHEEL +0 -0
  76. {scitex-2.15.2.dist-info → scitex-2.15.4.dist-info}/entry_points.txt +0 -0
  77. {scitex-2.15.2.dist-info → scitex-2.15.4.dist-info}/licenses/LICENSE +0 -0
scitex/scholar/url_finder/.tmp/open_url/_ResolverLinkFinder.py
@@ -1,344 +0,0 @@
- #!/usr/bin/env python3
- # -*- coding: utf-8 -*-
- # Timestamp: "2025-07-29 03:10:08 (ywatanabe)"
- # File: /home/ywatanabe/proj/scitex_repo/src/scitex/scholar/open_url/_ResolverLinkFinder.py
- # ----------------------------------------
- from __future__ import annotations
-
- import os
-
- __FILE__ = (
-     "./src/scitex/scholar/open_url/_ResolverLinkFinder.py"
- )
- __DIR__ = os.path.dirname(__FILE__)
- # ----------------------------------------
-
- """Robust resolver link finder using a prioritized, multi-layered approach.
-
- Priority order:
- 1. Link Target (domain matching) - Most reliable
- 2. Page Structure (CSS selectors) - Very reliable
- 3. Text Patterns - Good fallback
- """
-
- import re
- from typing import List, Optional
- from urllib.parse import urlparse
-
- from playwright.async_api import ElementHandle, Page
-
- from scitex import logging
-
- logger = logging.getLogger(__name__)
-
-
- class ResolverLinkFinder:
-     """Finds full-text links on resolver pages using multiple strategies."""
-
-     # DOI prefix to publisher domain mapping
-     DOI_TO_DOMAIN = {
-         "10.1038": [
-             "nature.com",
-             "springernature.com",
-         ],  # Nature Publishing Group
-         "10.1016": ["sciencedirect.com", "elsevier.com"],  # Elsevier
-         "10.1002": ["wiley.com", "onlinelibrary.wiley.com"],  # Wiley
-         "10.1007": ["springer.com", "link.springer.com"],  # Springer
-         "10.1126": ["science.org", "sciencemag.org"],  # Science/AAAS
-         "10.1021": ["acs.org", "pubs.acs.org"],  # ACS Publications
-         "10.1111": [
-             "wiley.com",
-             "onlinelibrary.wiley.com",
-         ],  # Wiley (alternative)
-         "10.1080": ["tandfonline.com"],  # Taylor & Francis
-         "10.1177": ["sagepub.com", "journals.sagepub.com"],  # SAGE
-         "10.1093": ["oup.com", "academic.oup.com"],  # Oxford
-         "10.1109": ["ieee.org", "ieeexplore.ieee.org"],  # IEEE
-         "10.1371": ["plos.org", "journals.plos.org"],  # PLOS
-         "10.1073": ["pnas.org"],  # PNAS
-         "10.1136": ["bmj.com"],  # BMJ
-         "10.3389": ["frontiersin.org"],  # Frontiers
-         "10.3390": ["mdpi.com"],  # MDPI
-     }
-
-     # Common resolver page structures
-     STRUCTURE_SELECTORS = [
-         # SFX (ExLibris) - used by many universities
-         "div#fulltext a",
-         "div.sfx-fulltext a",
-         "div.results-title > a",
-         "td.object-cell a",
-         ".getFullTxt a",
-         'div[id*="fulltext"] a',
-         'div[class*="fulltext"] a',
-         # SFX specific selectors for University of Melbourne
-         "a[title*='Wiley Online Library']",
-         "a[href*='wiley.com']",
-         "a[href*='onlinelibrary.wiley.com']",
-         ".sfx-target a",
-         ".target a",
-         "td a[href*='wiley']",
-         # Primo (ExLibris)
-         "prm-full-view-service-container a",
-         "span.availability-status-available a",
-         # Summon (ProQuest)
-         ".summon-fulltext-link",
-         "a.summon-link",
-         # EDS (EBSCO)
-         "a.fulltext-link",
-         ".ft-link a",
-         # Generic patterns
-         "a.full-text-link",
-         "a.fulltext",
-         "a#full-text-link",
-         ".access-link a",
-         ".available-link a",
-     ]
-
-     # Text patterns in priority order
-     TEXT_PATTERNS = [
-         # Most specific
-         "View full text at",
-         "Available from Nature",
-         "Available from ScienceDirect",
-         "Available from Wiley",
-         "Available from Wiley Online Library",
-         "Full text available from",
-         # Common patterns
-         "View full text",
-         "Full Text from Publisher",
-         "Get full text",
-         "Access full text",
-         "Go to article",
-         "Access article",
-         # Generic but reliable
-         "Full Text",
-         "Full text",
-         "Article",
-         "View",
-         "PDF",
-         "Download",
-     ]
-
-     def __init__(self):
-         self._doi_pattern = re.compile(r"10\.\d{4,}/[-._;()/:\w]+")
-
-     def get_expected_domains(self, doi: str) -> List[str]:
-         """Get expected publisher domains for a DOI."""
-         # Extract DOI prefix
-         match = re.match(r"(10\.\d{4,})", doi)
-         if not match:
-             return []
-
-         prefix = match.group(1)
-         return self.DOI_TO_DOMAIN.get(prefix, [])
-
-     async def find_link_async(self, page, doi: str) -> dict:
-         """Find the best full-text link using prioritized strategies."""
-         logger.info(f"Finding resolver link for DOI: {doi}")
-
-         # Strategy 1: Link Target (Most Reliable)
-         link_url = await self._find_by_domain_async(page, doi)
-         if link_url:
-             logger.info("✓ Found link using domain matching (Strategy 1)")
-             return {"success": True, "url": link_url, "method": "domain"}
-
-         # Strategy 2: Page Structure with scoring
-         link_url = await self._find_by_structure_async(page, doi)
-         if link_url:
-             logger.info("✓ Found link using page structure (Strategy 2)")
-             return {"success": True, "url": link_url, "method": "structure"}
-
-         logger.warning("✗ No suitable links found")
-         return {"success": False, "url": None, "method": None}
-
-     async def _find_by_domain_async(self, page: Page, doi: str) -> Optional[str]:
-         """Strategy 1: Find link by expected publisher domain."""
-         expected_domains = self.get_expected_domains(doi)
-         if not expected_domains:
-             logger.debug(f"No known publisher domains for DOI prefix: {doi}")
-             return None
-
-         logger.debug(f"Looking for links to domains: {expected_domains}")
-         all_links = await page.query_selector_all("a[href]")
-
-         for link in all_links:
-             href = await link.get_attribute("href")
-             if not href:
-                 continue
-
-             try:
-                 parsed = urlparse(href)
-                 domain = parsed.netloc.lower()
-
-                 for expected in expected_domains:
-                     if expected in domain:
-                         text = await link.inner_text() or ""
-                         logger.info(
-                             f"Found domain match: {domain} (text: '{text[:50]}')"
-                         )
-
-                         if not any(
-                             bad in text.lower()
-                             for bad in ["abstract", "preview", "summary"]
-                         ):
-                             return href
-                         else:
-                             logger.debug(
-                                 f"Skipping abstract/preview link: {text}"
-                             )
-             except Exception as e:
-                 logger.debug(f"Error parsing URL {href}: {e}")
-
-         return None
-
-     async def _find_by_structure_async(self, page, doi: str):
-         """Find link by page structure with publisher prioritization."""
-         potential_links = []
-         expected_domains = self.get_expected_domains(doi)
-         publisher_keywords = [
-             domain.split(".")[0] for domain in expected_domains
-         ]
-         aggregator_keywords = ["gale", "proquest", "ebsco", "jstor", "onefile"]
-
-         # Gather all possible links
-         for selector in self.STRUCTURE_SELECTORS:
-             try:
-                 elements = await page.query_selector_all(selector)
-                 logger.debug(
-                     f"Found {len(elements)} elements with selector: {selector}"
-                 )
-
-                 for element in elements:
-                     if await element.is_visible():
-                         href = await element.get_attribute("href")
-                         text = (await element.inner_text() or "").lower()
-
-                         if href and href.strip():
-                             potential_links.append(
-                                 {"href": href, "text": text, "score": 0}
-                             )
-             except Exception as element_error:
-                 logger.debug(
-                     f"Error with selector '{selector}': {element_error}"
-                 )
-
-         if not potential_links:
-             return None
-
-         # Score the links
-         for link in potential_links:
-             # Highest score for direct publisher match
-             if any(keyword in link["text"] for keyword in publisher_keywords):
-                 link["score"] = 3
-             # High score for generic publisher
-             elif "publisher" in link["text"]:
-                 link["score"] = 2
-             # Negative score for aggregators
-             elif any(
-                 keyword in link["text"] for keyword in aggregator_keywords
-             ):
-                 link["score"] = -1
-             # Default neutral score
-             else:
-                 link["score"] = 0
-
-         # Sort by score, highest first
-         sorted_links = sorted(
-             potential_links, key=lambda x: x["score"], reverse=True
-         )
-         best_link = sorted_links[0]
-
-         logger.debug(
-             f"Found structural match: '{best_link['text'][:50]}' -> {best_link['href']}"
-         )
-         return best_link["href"]
-
-     async def _find_by_text_async(self, page: Page) -> Optional[str]:
-         """Strategy 3: Find link by text patterns."""
-         for pattern in self.TEXT_PATTERNS:
-             try:
-                 selector = f'a:has-text("{pattern}")'
-                 link = await page.query_selector(selector)
-                 if link and await link.is_visible():
-                     href = await link.get_attribute("href")
-                     if href and href.strip():
-                         logger.debug(
-                             f"Found text match: '{pattern}' -> {href[:100]}"
-                         )
-                         return href
-             except Exception as e:
-                 logger.debug(f"Error with text pattern '{pattern}': {e}")
-
-         return None
-
-     async def click_and_wait_async(self, page: Page, link: ElementHandle) -> bool:
-         """Click link and wait for navigation.
-
-         Returns True if navigation succeeded.
-         """
-         initial_url = page.url
-
-         try:
-             # Get link info for logging
-             href = await link.get_attribute("href") or ""
-             text = await link.inner_text() or ""
-             logger.info(f"Clicking link: '{text[:50]}' -> {href[:100]}")
-
-             # Click and wait for navigation
-             await link.click()
-
-             # Wait for either navigation or network idle
-             try:
-                 await page.wait_for_load_state("networkidle", timeout=30000)
-             except:
-                 # Fallback to domcontentloaded if network doesn't settle
-                 await page.wait_for_load_state(
-                     "domcontentloaded", timeout=30000
-                 )
-
-             # Additional wait for JavaScript redirects
-             await page.wait_for_timeout(3000)
-
-             # Check if we navigated
-             final_url = page.url
-             if final_url != initial_url:
-                 logger.info(
-                     f"Successfully navigated: {initial_url} -> {final_url}"
-                 )
-                 return True
-             else:
-                 logger.warning("No navigation occurred after click")
-                 return False
-
-         except Exception as e:
-             logger.error(f"Error during click and navigation: {e}")
-             return False
-
-
- # Convenience function for integration
- async def find_and_click_resolver_link_async(page: Page, doi: str) -> Optional[str]:
-     """Find and click the best resolver link.
-
-     Args:
-         page: Playwright page object
-         doi: Target DOI
-
-     Returns:
-         Final URL after navigation, or None if failed
-     """
-     finder = ResolverLinkFinder()
-
-     # Find link
-     link = await finder.find_link_async(page, doi)
-     if not link:
-         return None
-
-     # Click and navigate
-     success = await finder.click_and_wait_async(page, link)
-     if success:
-         return page.url
-     else:
-         return None
-
- # EOF
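
For context, the sketch below shows one plausible way the removed finder was driven from an async Playwright session. It is a minimal sketch, not the package's documented usage: the import path (taken from the module's own # File: header), the example DOI, and the doi.org navigation step are all assumptions; only ResolverLinkFinder, get_expected_domains, and the dict returned by find_link_async come from the code above.

import asyncio

from playwright.async_api import async_playwright

# Assumed import path; the wheel shipped this module under url_finder/.tmp/.
from scitex.scholar.open_url._ResolverLinkFinder import ResolverLinkFinder


async def main() -> None:
    doi = "10.1038/s41586-020-2649-2"  # hypothetical example DOI
    finder = ResolverLinkFinder()
    # Prefix 10.1038 maps to ["nature.com", "springernature.com"] in DOI_TO_DOMAIN.
    print(finder.get_expected_domains(doi))

    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        page = await browser.new_page()
        # Open a landing/resolver page for the DOI, then let the finder scan it.
        await page.goto(f"https://doi.org/{doi}")
        result = await finder.find_link_async(page, doi)
        if result["success"]:
            print(f"found via {result['method']}: {result['url']}")
        else:
            print("no suitable full-text link found")
        await browser.close()


asyncio.run(main())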
scitex/scholar/url_finder/.tmp/open_url/__init__.py
@@ -1,24 +0,0 @@
- #!/usr/bin/env python3
- # -*- coding: utf-8 -*-
- # Timestamp: "2025-07-31 00:53:24 (ywatanabe)"
- # File: /home/ywatanabe/proj/scitex_repo/src/scitex/scholar/open_url/__init__.py
- # ----------------------------------------
- from __future__ import annotations
-
- import os
-
- __FILE__ = (
-     "./src/scitex/scholar/open_url/__init__.py"
- )
- __DIR__ = os.path.dirname(__FILE__)
- # ----------------------------------------
-
- from ._DOIToURLResolver import DOIToURLResolver
- from ._OpenURLResolver import OpenURLResolver
-
- __all__ = [
-     "OpenURLResolver",
-     "DOIToURLResolver",
- ]
-
- # EOF
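
Likewise, this removed __init__.py re-exported the two resolver classes named in its __all__. A minimal sketch of the imports it enabled, assuming the package path from the module's own # File: header (the wheel actually shipped the file under url_finder/.tmp/, so whether this path ever resolved on 2.15.2 is itself an assumption):

# Hypothetical imports matching the removed __init__.py's __all__ above;
# with the open_url package deleted in 2.15.4, they would raise ImportError.
from scitex.scholar.open_url import DOIToURLResolver, OpenURLResolver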