scitex 2.4.1__py3-none-any.whl → 2.4.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (36) hide show
  1. scitex/__version__.py +1 -1
  2. scitex/browser/__init__.py +53 -0
  3. scitex/browser/auth/__init__.py +35 -0
  4. scitex/browser/auth/google.py +381 -0
  5. scitex/browser/collaboration/__init__.py +5 -0
  6. scitex/browser/debugging/__init__.py +56 -0
  7. scitex/browser/debugging/_failure_capture.py +372 -0
  8. scitex/browser/debugging/_sync_session.py +259 -0
  9. scitex/browser/debugging/_test_monitor.py +284 -0
  10. scitex/browser/debugging/_visual_cursor.py +432 -0
  11. scitex/scholar/citation_graph/README.md +117 -0
  12. scitex/scholar/citation_graph/__init__.py +29 -0
  13. scitex/scholar/citation_graph/builder.py +214 -0
  14. scitex/scholar/citation_graph/database.py +246 -0
  15. scitex/scholar/citation_graph/example.py +96 -0
  16. scitex/scholar/citation_graph/models.py +80 -0
  17. scitex/scholar/config/ScholarConfig.py +23 -3
  18. scitex/scholar/config/default.yaml +56 -0
  19. scitex/scholar/core/Paper.py +102 -0
  20. scitex/scholar/core/__init__.py +44 -0
  21. scitex/scholar/core/journal_normalizer.py +524 -0
  22. scitex/scholar/core/oa_cache.py +285 -0
  23. scitex/scholar/core/open_access.py +457 -0
  24. scitex/scholar/metadata_engines/ScholarEngine.py +9 -1
  25. scitex/scholar/metadata_engines/individual/CrossRefLocalEngine.py +82 -21
  26. scitex/scholar/pdf_download/ScholarPDFDownloader.py +137 -0
  27. scitex/scholar/pdf_download/strategies/__init__.py +6 -0
  28. scitex/scholar/pdf_download/strategies/open_access_download.py +186 -0
  29. scitex/scholar/pipelines/ScholarPipelineSearchParallel.py +27 -9
  30. scitex/scholar/pipelines/ScholarPipelineSearchSingle.py +24 -8
  31. scitex/scholar/search_engines/ScholarSearchEngine.py +6 -1
  32. {scitex-2.4.1.dist-info → scitex-2.4.3.dist-info}/METADATA +1 -1
  33. {scitex-2.4.1.dist-info → scitex-2.4.3.dist-info}/RECORD +36 -20
  34. {scitex-2.4.1.dist-info → scitex-2.4.3.dist-info}/WHEEL +0 -0
  35. {scitex-2.4.1.dist-info → scitex-2.4.3.dist-info}/entry_points.txt +0 -0
  36. {scitex-2.4.1.dist-info → scitex-2.4.3.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,457 @@
1
+ #!/usr/bin/env python3
2
+ # -*- coding: utf-8 -*-
3
+ # File: /home/ywatanabe/proj/scitex-code/src/scitex/scholar/core/open_access.py
4
+ """
5
+ Open Access Detection Module.
6
+
7
+ Provides utilities for determining if a paper is open access based on:
8
+ - Known open access sources (arXiv, PMC, bioRxiv, etc.)
9
+ - Unpaywall API lookup
10
+ - Publisher patterns
11
+ - Journal DOAJ status
12
+ """
13
+
14
+ from __future__ import annotations
15
+
16
+ import re
17
+ from dataclasses import dataclass
18
+ from enum import Enum
19
+ from typing import Optional, List, Dict, Any
20
+ import asyncio
21
+ import aiohttp
22
+
23
+ from scitex import logging
24
+ from scitex.scholar.config import ScholarConfig
25
+
26
+ logger = logging.getLogger(__name__)
27
+
28
+ # Load OA config from default.yaml (single source of truth)
29
+ _config = None
30
+
31
def _get_config() -> ScholarConfig:
    """Return the shared ScholarConfig, building it on the first call.

    The instance is kept in the module-level ``_config`` variable, so every
    later call hands back that same object instead of constructing a new one.
    """
    global _config
    if _config is not None:
        return _config
    _config = ScholarConfig()
    return _config
37
+
38
+
39
def _get_oa_sources() -> frozenset:
    """Return the configured ``OPENACCESS_SOURCES`` names, lower-cased.

    A missing or empty config entry produces an empty frozenset.
    """
    configured = _get_config().get("OPENACCESS_SOURCES")
    if not configured:
        configured = []
    return frozenset(name.lower() for name in configured)
44
+
45
+
46
def _get_oa_journals() -> tuple:
    """Return the configured ``OPENACCESS_JOURNALS`` patterns, lower-cased.

    The order of the config entries is preserved; a missing or empty entry
    produces an empty tuple.
    """
    configured = _get_config().get("OPENACCESS_JOURNALS")
    if not configured:
        configured = []
    return tuple(pattern.lower() for pattern in configured)
51
+
52
+
53
def _get_unpaywall_email() -> str:
    """Return the ``unpaywall_email`` config value, or a built-in fallback.

    The fallback address is used whenever the config value is missing or
    otherwise falsy (e.g. an empty string).
    """
    configured = _get_config().get("unpaywall_email")
    return configured if configured else "research@scitex.io"
57
+
58
+
59
class OAStatus(Enum):
    """Open-access categories, named after the Unpaywall status strings.

    ``UNKNOWN`` is the extra member used when no status has been determined.
    """

    # Published in an OA journal (DOAJ listed)
    GOLD = "gold"
    # Available in a repository (arXiv, PMC, etc.)
    GREEN = "green"
    # OA article inside a subscription journal
    HYBRID = "hybrid"
    # Free to read on the publisher site, but without a license
    BRONZE = "bronze"
    # Paywalled
    CLOSED = "closed"
    # Status not determined
    UNKNOWN = "unknown"
67
+
68
+
69
@dataclass
class OAResult:
    """Outcome of an open-access check for a single paper."""

    # Whether the paper was judged to be open access.
    is_open_access: bool
    # OA category; UNKNOWN when the specific type was not determined.
    status: OAStatus
    # Link to an open copy, when one was found.
    oa_url: Optional[str] = None
    # Label describing how the OA status was determined.
    source: Optional[str] = None
    # License string, when the detection source supplies one.
    license: Optional[str] = None
    # Confidence in the result, from 0 (none) to 1 (certain).
    confidence: float = 1.0
78
+
79
+
80
+ # Open Access Sources and Journals are loaded from config/default.yaml
81
+ # The _get_oa_sources() and _get_oa_journals() helpers above provide lazy-loaded access to these config values
82
+ # (single source of truth: config/default.yaml → OPENACCESS_SOURCES, OPENACCESS_JOURNALS)
83
+
84
+ # arXiv ID patterns
85
ARXIV_PATTERNS = [
    # New-style ID: YYMM.NNNN or YYMM.NNNNN with optional version, e.g. 2301.12345v2
    re.compile(r"^\d{4}\.\d{4,5}(v\d+)?$"),
    # Old-style ID: archive/YYMMNNN with an optional two-letter subject class,
    # e.g. hep-th/9901001 or math.GT/0309136 (both forms are valid arXiv IDs)
    re.compile(r"^[a-z-]+(\.[A-Z]{2})?/\d{7}(v\d+)?$"),
    # New-style ID carrying an "arXiv:" prefix (any letter case)
    re.compile(r"^arxiv:\d{4}\.\d{4,5}(v\d+)?$", re.IGNORECASE),
]
90
+
91
+
92
def is_arxiv_id(identifier: str) -> bool:
    """Tell whether ``identifier`` matches one of the ``ARXIV_PATTERNS``.

    Surrounding whitespace is ignored; an empty or ``None`` identifier is
    never considered an arXiv ID.
    """
    if not identifier:
        return False
    candidate = identifier.strip()
    for pattern in ARXIV_PATTERNS:
        if pattern.match(candidate):
            return True
    return False
98
+
99
+
100
def is_open_access_source(source: str) -> bool:
    """Tell whether ``source`` names a known open access repository.

    The comparison is case-insensitive and runs against the names configured
    under OPENACCESS_SOURCES (config/default.yaml). An empty or ``None``
    source is never a match.
    """
    return bool(source) and source.lower() in _get_oa_sources()
108
+
109
+
110
def is_open_access_journal(journal_name: str, use_cache: bool = True) -> bool:
    """Tell whether ``journal_name`` belongs to a known open access journal.

    The lookup stops at the first tier that reports a match:

    1. Substring match of each configured OPENACCESS_JOURNALS pattern
       (config/default.yaml) against the lower-cased journal name.
    2. ``is_oa_journal_cached`` from the sibling ``oa_cache`` module
       (OpenAlex OA sources cache).
    3. ``is_open_access`` on the normalizer from the sibling
       ``journal_normalizer`` module (abbreviations, name variants).

    Args:
        journal_name: Journal name to check.
        use_cache: When False, tiers 2 and 3 are both skipped and only the
            config patterns are consulted.

    Returns:
        True if any consulted tier recognises the journal as open access,
        False otherwise (including for an empty name).
    """
    if not journal_name:
        return False

    # Tier 1: configured patterns, matched as substrings of the lower-cased name
    lowered_name = journal_name.lower()
    for pattern in _get_oa_journals():
        if pattern in lowered_name:
            return True

    if not use_cache:
        return False

    # Tier 2: OpenAlex cache; an ImportError here just means "tier unavailable"
    try:
        from .oa_cache import is_oa_journal_cached
        if is_oa_journal_cached(journal_name):
            return True
    except ImportError:
        pass

    # Tier 3: journal normalizer; same best-effort handling as tier 2
    try:
        from .journal_normalizer import get_journal_normalizer
        if get_journal_normalizer().is_open_access(journal_name):
            return True
    except ImportError:
        pass

    return False
154
+
155
+
156
def detect_oa_from_identifiers(
    doi: Optional[str] = None,
    arxiv_id: Optional[str] = None,
    pmcid: Optional[str] = None,
    source: Optional[str] = None,
    journal: Optional[str] = None,
    is_open_access_flag: Optional[bool] = None,
) -> OAResult:
    """
    Detect open access status from paper identifiers without API calls.

    This is fast but may miss some OA papers (e.g., hybrid articles).
    For comprehensive detection, use check_oa_status_async() with Unpaywall.

    The checks run in this order and the first match wins: pre-existing OA
    flag, arXiv ID, PMC ID, known OA source, known OA journal. If none match,
    a non-OA result is returned with low confidence.

    Args:
        doi: Paper DOI
        arxiv_id: arXiv identifier (an optional "arXiv:" prefix and
            surrounding whitespace are tolerated)
        pmcid: PubMed Central ID (starts with PMC, any letter case;
            surrounding whitespace is tolerated)
        source: Source database (arxiv, pmc, biorxiv, etc.)
        journal: Journal name
        is_open_access_flag: Pre-existing OA flag from search API

    Returns:
        OAResult with detection results
    """
    # If we already have an OA flag from a reliable source, trust it
    if is_open_access_flag is True:
        return OAResult(
            is_open_access=True,
            status=OAStatus.UNKNOWN,  # We don't know the specific type
            source="api_flag",
            confidence=0.9,
        )

    # arXiv - always open access (GREEN)
    if arxiv_id and is_arxiv_id(arxiv_id):
        # is_arxiv_id() accepts padded and "arXiv:"-prefixed IDs; neither may
        # leak into the PDF URL, so reduce the argument to the bare identifier.
        bare_arxiv_id = re.sub(
            r"^arxiv:", "", arxiv_id.strip(), flags=re.IGNORECASE
        )
        return OAResult(
            is_open_access=True,
            status=OAStatus.GREEN,
            oa_url=f"https://arxiv.org/pdf/{bare_arxiv_id}.pdf",
            source="arxiv",
            confidence=1.0,
        )

    # PMC - always open access (GREEN)
    pmcid_clean = pmcid.strip() if pmcid else ""
    if pmcid_clean.upper().startswith("PMC"):
        # Drop the three-letter prefix so the URL always uses upper-case "PMC"
        pmc_num = pmcid_clean[3:]
        return OAResult(
            is_open_access=True,
            status=OAStatus.GREEN,
            oa_url=f"https://www.ncbi.nlm.nih.gov/pmc/articles/PMC{pmc_num}/pdf/",
            source="pmc",
            confidence=1.0,
        )

    # Known OA source: repositories count as GREEN, everything else as GOLD
    if source and is_open_access_source(source):
        is_repository = source.lower() in ("arxiv", "pmc", "biorxiv", "medrxiv")
        return OAResult(
            is_open_access=True,
            status=OAStatus.GREEN if is_repository else OAStatus.GOLD,
            source=f"source_{source}",
            confidence=0.95,
        )

    # Known OA journal
    if journal and is_open_access_journal(journal):
        return OAResult(
            is_open_access=True,
            status=OAStatus.GOLD,
            source="oa_journal",
            confidence=0.85,
        )

    # If we have a DOI but no other OA indicators, it's likely paywalled
    if doi and not arxiv_id and not pmcid:
        return OAResult(
            is_open_access=False,
            status=OAStatus.UNKNOWN,  # Could be hybrid OA, need Unpaywall to confirm
            source="no_oa_indicators",
            confidence=0.6,  # Low confidence - could be hybrid OA
        )

    # Unknown
    return OAResult(
        is_open_access=False,
        status=OAStatus.UNKNOWN,
        source="unknown",
        confidence=0.3,
    )
245
+
246
+
247
async def check_oa_status_unpaywall(
    doi: str,
    email: Optional[str] = None,
    timeout: float = 10.0,
) -> OAResult:
    """
    Check open access status via Unpaywall API.

    Unpaywall is the authoritative source for OA status detection.
    Rate limit: 100,000 requests/day with email.

    Never raises for network or API problems: every failure is reported as
    an UNKNOWN result whose ``source`` names the failure mode
    (``unpaywall_not_found``, ``unpaywall_error``, ``unpaywall_timeout``,
    ``unpaywall_exception``) and whose confidence is at most 0.5.

    Args:
        doi: Paper DOI (required). A leading ``doi:`` or
            ``http(s)://(dx.)doi.org/`` prefix is stripped.
        email: Email for Unpaywall API (required for polite access);
            falls back to the configured address when None
        timeout: Request timeout in seconds

    Returns:
        OAResult with comprehensive OA information
    """
    # Local import: keeps this block self-contained without touching the
    # module-level import list.
    from urllib.parse import quote

    # Clean DOI: drop whitespace and any resolver / scheme prefix
    doi = (doi or "").strip()
    doi_lower = doi.lower()
    for prefix in (
        "https://doi.org/",
        "http://doi.org/",
        "https://dx.doi.org/",
        "http://dx.doi.org/",
        "doi:",
    ):
        if doi_lower.startswith(prefix):
            doi = doi[len(prefix):].strip()
            break

    if not doi:
        return OAResult(
            is_open_access=False,
            status=OAStatus.UNKNOWN,
            source="no_doi",
            # Nothing was looked up, so do not inherit the 1.0 default
            confidence=0.0,
        )

    # Use config email if not provided
    if email is None:
        email = _get_unpaywall_email()

    # DOIs may contain characters such as "#", "?" or "<" that would otherwise
    # be parsed as URL syntax; the email goes through ``params`` so that
    # characters like "+" are encoded as well.
    url = f"https://api.unpaywall.org/v2/{quote(doi, safe='/')}"
    params = {"email": email}

    try:
        async with aiohttp.ClientSession() as session:
            async with session.get(
                url,
                params=params,
                timeout=aiohttp.ClientTimeout(total=timeout),
            ) as resp:
                if resp.status == 404:
                    return OAResult(
                        is_open_access=False,
                        status=OAStatus.UNKNOWN,
                        source="unpaywall_not_found",
                        confidence=0.5,
                    )

                if resp.status != 200:
                    logger.warning(f"Unpaywall API error: {resp.status}")
                    return OAResult(
                        is_open_access=False,
                        status=OAStatus.UNKNOWN,
                        source="unpaywall_error",
                        confidence=0.0,
                    )

                data = await resp.json()

                is_oa = data.get("is_oa", False)
                oa_status_str = data.get("oa_status", "closed")

                # Map Unpaywall status to our enum; unrecognised -> UNKNOWN
                status_map = {
                    "gold": OAStatus.GOLD,
                    "green": OAStatus.GREEN,
                    "hybrid": OAStatus.HYBRID,
                    "bronze": OAStatus.BRONZE,
                    "closed": OAStatus.CLOSED,
                }
                status = status_map.get(oa_status_str, OAStatus.UNKNOWN)

                # Get best OA location, preferring a direct PDF link
                oa_url = None
                license_str = None
                best_oa = data.get("best_oa_location")
                if best_oa:
                    oa_url = best_oa.get("url_for_pdf") or best_oa.get("url")
                    license_str = best_oa.get("license")

                return OAResult(
                    is_open_access=is_oa,
                    status=status,
                    oa_url=oa_url,
                    source="unpaywall",
                    license=license_str,
                    confidence=1.0,
                )

    except asyncio.TimeoutError:
        logger.warning(f"Unpaywall timeout for DOI: {doi}")
        return OAResult(
            is_open_access=False,
            status=OAStatus.UNKNOWN,
            source="unpaywall_timeout",
            confidence=0.0,
        )
    except Exception as e:
        # Deliberate catch-all: this lookup is best-effort and must not
        # propagate network or parsing failures to the caller.
        logger.error(f"Unpaywall API error: {e}")
        return OAResult(
            is_open_access=False,
            status=OAStatus.UNKNOWN,
            source="unpaywall_exception",
            confidence=0.0,
        )
354
+
355
+
356
async def check_oa_status_async(
    doi: Optional[str] = None,
    arxiv_id: Optional[str] = None,
    pmcid: Optional[str] = None,
    source: Optional[str] = None,
    journal: Optional[str] = None,
    is_open_access_flag: Optional[bool] = None,
    use_unpaywall: bool = True,
    unpaywall_email: str = None,
) -> OAResult:
    """
    Combine local OA detection with an optional Unpaywall lookup.

    Local detection (detect_oa_from_identifiers) always runs first. Unpaywall
    is queried only when it is enabled, a DOI is present, and the local
    confidence is below 0.7; its answer replaces the local one only if it
    carries a higher confidence.

    Args:
        doi: Paper DOI
        arxiv_id: arXiv identifier
        pmcid: PubMed Central ID
        source: Source database
        journal: Journal name
        is_open_access_flag: Pre-existing OA flag
        use_unpaywall: Whether to query Unpaywall for uncertain cases
        unpaywall_email: Email for Unpaywall API

    Returns:
        OAResult with best available OA information
    """
    local = detect_oa_from_identifiers(
        doi=doi,
        arxiv_id=arxiv_id,
        pmcid=pmcid,
        source=source,
        journal=journal,
        is_open_access_flag=is_open_access_flag,
    )

    # Confident local answer: no network round-trip needed
    if local.confidence >= 0.9:
        return local

    # Remote lookup needs opt-in, a DOI, and a genuinely uncertain local result
    should_query = use_unpaywall and doi and local.confidence < 0.7
    if not should_query:
        return local

    remote = await check_oa_status_unpaywall(
        doi=doi,
        email=unpaywall_email,
    )

    # Failed lookups come back with low confidence and lose to the local result
    return remote if remote.confidence > local.confidence else local
411
+
412
+
413
def check_oa_status(
    doi: Optional[str] = None,
    arxiv_id: Optional[str] = None,
    pmcid: Optional[str] = None,
    source: Optional[str] = None,
    journal: Optional[str] = None,
    is_open_access_flag: Optional[bool] = None,
    use_unpaywall: bool = False,  # Default to sync-safe behavior
) -> OAResult:
    """
    Synchronous wrapper for OA detection.

    By default only uses local detection (no API calls).
    Set use_unpaywall=True to also consult the Unpaywall API; the async
    lookup is then driven to completion before this function returns.

    When called from a thread whose event loop is already running (e.g. a
    Jupyter cell or an async framework callback), the lookup runs on a
    short-lived worker thread with its own event loop, because
    ``run_until_complete()`` cannot be nested inside a running loop. The
    calling thread blocks until the result is available.

    Args:
        doi: Paper DOI
        arxiv_id: arXiv identifier
        pmcid: PubMed Central ID
        source: Source database
        journal: Journal name
        is_open_access_flag: Pre-existing OA flag
        use_unpaywall: Whether to query Unpaywall for uncertain cases

    Returns:
        OAResult with best available OA information
    """
    identifiers = dict(
        doi=doi,
        arxiv_id=arxiv_id,
        pmcid=pmcid,
        source=source,
        journal=journal,
        is_open_access_flag=is_open_access_flag,
    )

    if not use_unpaywall:
        return detect_oa_from_identifiers(**identifiers)

    try:
        asyncio.get_running_loop()
    except RuntimeError:
        loop_is_running = False
    else:
        loop_is_running = True

    if loop_is_running:
        # Local import: only needed on this path.
        from concurrent.futures import ThreadPoolExecutor

        with ThreadPoolExecutor(max_workers=1) as pool:
            future = pool.submit(
                lambda: asyncio.run(
                    check_oa_status_async(use_unpaywall=True, **identifiers)
                )
            )
            return future.result()

    # No running loop in this thread: reuse the current loop, or create and
    # install one if none is available.
    try:
        loop = asyncio.get_event_loop()
    except RuntimeError:
        loop = asyncio.new_event_loop()
        asyncio.set_event_loop(loop)

    return loop.run_until_complete(
        check_oa_status_async(use_unpaywall=True, **identifiers)
    )
455
+
456
+
457
+ # EOF
@@ -240,6 +240,13 @@ class ScholarEngine:
240
240
  if name in engine_classes:
241
241
  if name == "url_doi_engine":
242
242
  self._engine_instances[name] = engine_classes[name]()
243
+ elif name == "CrossRefLocal":
244
+ # Get API URL from config (supports SCITEX_SCHOLAR_CROSSREF_API_URL env var)
245
+ api_url = self.config.resolve("crossref_api_url", "http://127.0.0.1:3333")
246
+ self._engine_instances[name] = engine_classes[name](
247
+ "research@example.com",
248
+ api_url=api_url
249
+ )
243
250
  else:
244
251
  self._engine_instances[name] = engine_classes[name](
245
252
  "research@example.com"
@@ -483,7 +490,8 @@ class ScholarEngine:
483
490
  """Merge two metadata structures with engine priority."""
484
491
  merged = base.copy()
485
492
  engine_priority = {
486
- "URL": 5,
493
+ "URL": 6,
494
+ "CrossRefLocal": 5,
487
495
  "CrossRef": 4,
488
496
  "OpenAlex": 3,
489
497
  "Semantic_Scholar": 2,
@@ -23,7 +23,14 @@ logger = logging.getLogger(__name__)
23
23
 
24
24
 
25
25
  class CrossRefLocalEngine(BaseDOIEngine):
26
- """CrossRef Local Engine using local Django API"""
26
+ """CrossRef Local Engine using local Django API or external public API
27
+
28
+ Supports both:
29
+ - Internal API: http://crossref:3333 (Docker network)
30
+ - External API: https://scitex.ai/scholar/api/crossref (Public internet)
31
+
32
+ Automatically detects API format and adjusts endpoints accordingly.
33
+ """
27
34
 
28
35
  def __init__(
29
36
  self,
@@ -33,6 +40,9 @@ class CrossRefLocalEngine(BaseDOIEngine):
33
40
  super().__init__(email)
34
41
  self.api_url = api_url.rstrip("/")
35
42
 
43
+ # Detect API type: external (public) vs internal (Docker/local)
44
+ self._is_external_api = "/api/crossref" in self.api_url or "scitex.ai" in self.api_url
45
+
36
46
  @property
37
47
  def name(self) -> str:
38
48
  return "CrossRefLocal"
@@ -41,6 +51,26 @@ class CrossRefLocalEngine(BaseDOIEngine):
41
51
  def rate_limit_delay(self) -> float:
42
52
  return 0.01
43
53
 
54
+ def _build_endpoint_url(self, endpoint: str) -> str:
55
+ """Build the correct endpoint URL based on API type
56
+
57
+ Args:
58
+ endpoint: Endpoint name (e.g., 'search', 'health', 'stats')
59
+
60
+ Returns:
61
+ Full URL for the endpoint
62
+
63
+ Examples:
64
+ Internal: http://crossref:3333/api/search/
65
+ External: https://scitex.ai/scholar/api/crossref/search/
66
+ """
67
+ if self._is_external_api:
68
+ # External API: base URL already includes /scholar/api/crossref
69
+ return f"{self.api_url}/{endpoint}/"
70
+ else:
71
+ # Internal API: need to add /api/ prefix
72
+ return f"{self.api_url}/api/{endpoint}/"
73
+
44
74
  def search(
45
75
  self,
46
76
  title: Optional[str] = None,
@@ -80,8 +110,8 @@ class CrossRefLocalEngine(BaseDOIEngine):
80
110
  def _make_search_request(
81
111
  self, params: dict, return_as: str
82
112
  ) -> Optional[Dict]:
83
- """Make search request to local API"""
84
- url = f"{self.api_url}/api/search/"
113
+ """Make search request to local or external API"""
114
+ url = self._build_endpoint_url("search")
85
115
 
86
116
  try:
87
117
  assert return_as in [
@@ -129,7 +159,7 @@ class CrossRefLocalEngine(BaseDOIEngine):
129
159
  doi = doi.replace("https://doi.org/", "").replace(
130
160
  "http://doi.org/", ""
131
161
  )
132
- url = f"{self.api_url}/api/search/"
162
+ url = self._build_endpoint_url("search")
133
163
  params = {"doi": doi}
134
164
 
135
165
  try:
@@ -242,22 +272,53 @@ if __name__ == "__main__":
242
272
  from scitex.scholar.metadata_engines.individual import CrossRefLocalEngine
243
273
 
244
274
  TITLE = "deep learning"
245
- DOI = "10.1001/.387"
246
-
247
- engine = CrossRefLocalEngine("test@example.com")
248
- outputs = {}
249
-
250
- outputs["metadata_by_title_dict"] = engine.search(title=TITLE)
251
- outputs["metadata_by_doi_dict"] = engine.search(doi=DOI)
252
-
253
- for k, v in outputs.items():
254
- print("----------------------------------------")
255
- print(k)
256
- print("----------------------------------------")
257
- pprint(v)
258
- time.sleep(1)
259
-
260
-
261
- # python -m scitex.scholar.engines.individual.CrossRefLocalEngine
275
+ DOI = "10.1038/nature12373"
276
+
277
+ # Example 1: Internal API (Docker network or localhost)
278
+ print("\n" + "=" * 60)
279
+ print("INTERNAL API EXAMPLE")
280
+ print("=" * 60)
281
+ engine_internal = CrossRefLocalEngine(
282
+ "test@example.com",
283
+ api_url="http://crossref:3333"
284
+ )
285
+ print(f"API URL: {engine_internal.api_url}")
286
+ print(f"Is External: {engine_internal._is_external_api}")
287
+ print(f"Search endpoint: {engine_internal._build_endpoint_url('search')}")
288
+
289
+ # Example 2: External API (public internet)
290
+ print("\n" + "=" * 60)
291
+ print("EXTERNAL API EXAMPLE")
292
+ print("=" * 60)
293
+ engine_external = CrossRefLocalEngine(
294
+ "test@example.com",
295
+ api_url="https://scitex.ai/scholar/api/crossref"
296
+ )
297
+ print(f"API URL: {engine_external.api_url}")
298
+ print(f"Is External: {engine_external._is_external_api}")
299
+ print(f"Search endpoint: {engine_external._build_endpoint_url('search')}")
300
+
301
+ # Test search (use external for demo)
302
+ print("\n" + "=" * 60)
303
+ print("SEARCH TEST")
304
+ print("=" * 60)
305
+ result = engine_external.search(doi=DOI)
306
+ if result:
307
+ print(f"Title: {result.get('basic', {}).get('title')}")
308
+ print(f"DOI: {result.get('id', {}).get('doi')}")
309
+ print(f"Year: {result.get('basic', {}).get('year')}")
310
+ else:
311
+ print("No results found")
312
+
313
+
314
+ # Usage examples:
315
+ #
316
+ # Internal API (from NAS Docker):
317
+ # export SCITEX_SCHOLAR_CROSSREF_API_URL=http://crossref:3333
318
+ # python -m scitex.scholar.metadata_engines.individual.CrossRefLocalEngine
319
+ #
320
+ # External API (from anywhere):
321
+ # export SCITEX_SCHOLAR_CROSSREF_API_URL=https://scitex.ai/scholar/api/crossref
322
+ # python -m scitex.scholar.metadata_engines.individual.CrossRefLocalEngine
262
323
 
263
324
  # EOF