scitex 2.4.1__py3-none-any.whl → 2.4.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- scitex/__version__.py +1 -1
- scitex/browser/__init__.py +53 -0
- scitex/browser/auth/__init__.py +35 -0
- scitex/browser/auth/google.py +381 -0
- scitex/browser/collaboration/__init__.py +5 -0
- scitex/browser/debugging/__init__.py +56 -0
- scitex/browser/debugging/_failure_capture.py +372 -0
- scitex/browser/debugging/_sync_session.py +259 -0
- scitex/browser/debugging/_test_monitor.py +284 -0
- scitex/browser/debugging/_visual_cursor.py +432 -0
- scitex/scholar/citation_graph/README.md +117 -0
- scitex/scholar/citation_graph/__init__.py +29 -0
- scitex/scholar/citation_graph/builder.py +214 -0
- scitex/scholar/citation_graph/database.py +246 -0
- scitex/scholar/citation_graph/example.py +96 -0
- scitex/scholar/citation_graph/models.py +80 -0
- scitex/scholar/config/ScholarConfig.py +23 -3
- scitex/scholar/config/default.yaml +56 -0
- scitex/scholar/core/Paper.py +102 -0
- scitex/scholar/core/__init__.py +44 -0
- scitex/scholar/core/journal_normalizer.py +524 -0
- scitex/scholar/core/oa_cache.py +285 -0
- scitex/scholar/core/open_access.py +457 -0
- scitex/scholar/metadata_engines/ScholarEngine.py +9 -1
- scitex/scholar/metadata_engines/individual/CrossRefLocalEngine.py +82 -21
- scitex/scholar/pdf_download/ScholarPDFDownloader.py +137 -0
- scitex/scholar/pdf_download/strategies/__init__.py +6 -0
- scitex/scholar/pdf_download/strategies/open_access_download.py +186 -0
- scitex/scholar/pipelines/ScholarPipelineSearchParallel.py +27 -9
- scitex/scholar/pipelines/ScholarPipelineSearchSingle.py +24 -8
- scitex/scholar/search_engines/ScholarSearchEngine.py +6 -1
- {scitex-2.4.1.dist-info → scitex-2.4.3.dist-info}/METADATA +1 -1
- {scitex-2.4.1.dist-info → scitex-2.4.3.dist-info}/RECORD +36 -20
- {scitex-2.4.1.dist-info → scitex-2.4.3.dist-info}/WHEEL +0 -0
- {scitex-2.4.1.dist-info → scitex-2.4.3.dist-info}/entry_points.txt +0 -0
- {scitex-2.4.1.dist-info → scitex-2.4.3.dist-info}/licenses/LICENSE +0 -0
|
@@ -0,0 +1,80 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Data models for citation graphs.
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
from dataclasses import dataclass, field
|
|
6
|
+
from typing import List, Dict, Optional
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
@dataclass
class PaperNode:
    """One paper (vertex) of a citation network, identified by its DOI."""

    doi: str
    title: str = ""
    year: int = 0
    authors: List[str] = field(default_factory=list)
    journal: str = ""
    citation_count: int = 0
    similarity_score: float = 0.0
    metadata: Dict = field(default_factory=dict)

    def to_dict(self) -> Dict:
        """Build the JSON-export representation of this node.

        The DOI is exported under the key ``"id"``; the remaining exported
        keys carry the attribute of the same name. ``metadata`` is not part
        of the export.
        """
        exported = {"id": self.doi}
        # Key order below is the order of the exported dictionary.
        for attr_name in (
            "title",
            "year",
            "authors",
            "journal",
            "citation_count",
            "similarity_score",
        ):
            exported[attr_name] = getattr(self, attr_name)
        return exported
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
@dataclass
class CitationEdge:
    """A typed, weighted link between two papers, each referenced by DOI."""

    source: str  # DOI of citing paper
    target: str  # DOI of cited paper
    edge_type: str = "cites"  # 'cites', 'cited_by', 'similar'
    weight: float = 1.0

    def to_dict(self) -> Dict:
        """Build the JSON-export representation of this edge.

        ``edge_type`` is exported under the shorter key ``"type"``.
        """
        return dict(
            source=self.source,
            target=self.target,
            type=self.edge_type,
            weight=self.weight,
        )
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
@dataclass
class CitationGraph:
    """A citation network: a seed DOI plus the nodes and edges around it."""

    seed_doi: str
    nodes: List[PaperNode] = field(default_factory=list)
    edges: List[CitationEdge] = field(default_factory=list)
    metadata: Dict = field(default_factory=dict)

    def to_dict(self) -> Dict:
        """Build the JSON-export representation of the whole graph.

        Nodes and edges are exported through their own ``to_dict`` methods;
        ``metadata`` is passed through as-is.
        """
        exported_nodes = []
        for paper in self.nodes:
            exported_nodes.append(paper.to_dict())

        exported_edges = []
        for link in self.edges:
            exported_edges.append(link.to_dict())

        return dict(
            seed=self.seed_doi,
            nodes=exported_nodes,
            edges=exported_edges,
            metadata=self.metadata,
        )

    @property
    def node_count(self) -> int:
        """How many nodes the graph currently holds."""
        total_nodes = len(self.nodes)
        return total_nodes

    @property
    def edge_count(self) -> int:
        """How many edges the graph currently holds."""
        total_edges = len(self.edges)
        return total_edges
|
|
@@ -29,8 +29,22 @@ logger = getLogger(__name__)
|
|
|
29
29
|
|
|
30
30
|
|
|
31
31
|
class ScholarConfig:
|
|
32
|
-
def __init__(
|
|
32
|
+
def __init__(
|
|
33
|
+
self,
|
|
34
|
+
config_path: Optional[Union[str, Path]] = None,
|
|
35
|
+
scholar_dir: Optional[Union[str, Path]] = None,
|
|
36
|
+
):
|
|
37
|
+
"""Initialize ScholarConfig.
|
|
38
|
+
|
|
39
|
+
Args:
|
|
40
|
+
config_path: Path to custom config YAML file
|
|
41
|
+
scholar_dir: Direct path to scholar directory (e.g., /data/users/alice/.scitex)
|
|
42
|
+
This bypasses SCITEX_DIR env var for thread-safe multi-user usage.
|
|
43
|
+
Use this in Django/multi-user environments to avoid race conditions.
|
|
44
|
+
"""
|
|
33
45
|
self.name = self.__class__.__name__
|
|
46
|
+
self._explicit_scholar_dir = scholar_dir # Store for thread-safe access
|
|
47
|
+
|
|
34
48
|
if config_path and Path(config_path).exists():
|
|
35
49
|
config_data = self.load_yaml(config_path)
|
|
36
50
|
else:
|
|
@@ -114,8 +128,14 @@ class ScholarConfig:
|
|
|
114
128
|
|
|
115
129
|
# Path Management ----------------------------------------
|
|
116
130
|
def _setup_path_manager(self, scholar_dir=None):
    """Create ``self.path_manager`` rooted at ``<base dir>/scholar``.

    The base directory is ``self._explicit_scholar_dir`` when that is
    truthy; otherwise it is whatever ``self.cascade.resolve`` returns for
    ``"scholar_dir"`` (default ``"~/.scitex"``). ``~`` is expanded before
    the ``scholar`` sub-directory is appended.

    Args:
        scholar_dir: Not read by this method; the directory always comes
            from the two sources above.
            NOTE(review): confirm the parameter is kept only for
            interface compatibility.
    """
    # Priority: explicit parameter > env var > config > default
    explicit_dir = self._explicit_scholar_dir
    if explicit_dir:
        # Use explicitly provided path (thread-safe for multi-user)
        root_dir = explicit_dir
    else:
        # Fall back to cascade resolution (uses SCITEX_DIR env var)
        root_dir = self.cascade.resolve("scholar_dir", default="~/.scitex")

    self.path_manager = PathManager(
        scholar_dir=Path(root_dir).expanduser() / "scholar"
    )
|
|
120
140
|
|
|
121
141
|
@property
|
|
@@ -38,6 +38,61 @@ timeout: 60
|
|
|
38
38
|
# Note: Download directory is fixed at {scholar_dir}/library/downloads/
|
|
39
39
|
# Use get_library_downloads_dir() to access it
|
|
40
40
|
|
|
41
|
+
# Open Access & Paywall Options
|
|
42
|
+
# ----------------------------------------
|
|
43
|
+
# Prefer Open Access versions when available
|
|
44
|
+
prefer_open_access: ${SCITEX_SCHOLAR_PREFER_OPEN_ACCESS:-true}
|
|
45
|
+
# Allow attempting paywalled journal downloads (requires authentication)
|
|
46
|
+
# Only enable for local/personal use with valid institutional access
|
|
47
|
+
enable_paywall_access: ${SCITEX_SCHOLAR_ENABLE_PAYWALL_ACCESS:-false}
|
|
48
|
+
# Track paywall bypass attempts in metadata (for transparency)
|
|
49
|
+
track_paywall_attempts: ${SCITEX_SCHOLAR_TRACK_PAYWALL_ATTEMPTS:-true}
|
|
50
|
+
|
|
51
|
+
# Open Access Sources (repositories that are always free to download)
|
|
52
|
+
OPENACCESS_SOURCES:
|
|
53
|
+
- arxiv
|
|
54
|
+
- pmc
|
|
55
|
+
- pubmed_central
|
|
56
|
+
- biorxiv
|
|
57
|
+
- medrxiv
|
|
58
|
+
- chemrxiv
|
|
59
|
+
- doaj
|
|
60
|
+
- plos
|
|
61
|
+
- peerj
|
|
62
|
+
- frontiers
|
|
63
|
+
- mdpi
|
|
64
|
+
- hindawi
|
|
65
|
+
- nature_communications # Most articles are OA
|
|
66
|
+
|
|
67
|
+
# Open Access Journals (partial list, DOAJ has full list)
|
|
68
|
+
# These are journal name patterns (case-insensitive substring match)
|
|
69
|
+
OPENACCESS_JOURNALS:
|
|
70
|
+
- "plos one"
|
|
71
|
+
- "plos biology"
|
|
72
|
+
- "plos medicine"
|
|
73
|
+
- "plos computational biology"
|
|
74
|
+
- "plos genetics"
|
|
75
|
+
- "plos pathogens"
|
|
76
|
+
- "plos neglected tropical diseases"
|
|
77
|
+
- "scientific reports"
|
|
78
|
+
- "nature communications"
|
|
79
|
+
- "elife"
|
|
80
|
+
- "peerj"
|
|
81
|
+
- "frontiers in" # All Frontiers journals
|
|
82
|
+
- "bmc " # All BMC journals
|
|
83
|
+
- "journal of open source software"
|
|
84
|
+
- "f1000research"
|
|
85
|
+
- "gigascience"
|
|
86
|
+
- "cell reports"
|
|
87
|
+
- "science advances"
|
|
88
|
+
- "iscience"
|
|
89
|
+
- "heliyon"
|
|
90
|
+
- "cureus"
|
|
91
|
+
- "jmir" # Journal of Medical Internet Research
|
|
92
|
+
|
|
93
|
+
# Unpaywall API email (required for polite access)
|
|
94
|
+
unpaywall_email: ${SCITEX_SCHOLAR_UNPAYWALL_EMAIL:-"research@scitex.io"}
|
|
95
|
+
|
|
41
96
|
# ----------------------------------------
|
|
42
97
|
# cache
|
|
43
98
|
# ----------------------------------------
|
|
@@ -54,6 +109,7 @@ timeout: 60
|
|
|
54
109
|
# ----------------------------------------
|
|
55
110
|
semantic_scholar_api_key: ${SCITEX_SCHOLAR_SEMANTIC_SCHOLAR_API_KEY:-null}
|
|
56
111
|
crossref_api_key: ${SCITEX_SCHOLAR_CROSSREF_API_KEY:-null}
|
|
112
|
+
crossref_api_url: ${SCITEX_SCHOLAR_CROSSREF_API_URL:-"http://127.0.0.1:3333"}
|
|
57
113
|
pubmed_api_key: ${SCITEX_SCHOLAR_PUBMED_API_KEY:-null}
|
|
58
114
|
twocaptcha_api_key: ${SCITEX_SCHOLAR_2CAPTCHA_API_KEY:-null}
|
|
59
115
|
|
scitex/scholar/core/Paper.py
CHANGED
|
@@ -284,6 +284,37 @@ class PathMetadata(BaseModel):
|
|
|
284
284
|
validate_assignment = True # Validate on attribute assignment too
|
|
285
285
|
|
|
286
286
|
|
|
287
|
+
class AccessMetadata(BaseModel):
    """Open access and licensing metadata with source tracking.

    Tracks whether a paper is open access and provides URLs for OA versions.
    Also includes license information when available.

    Every value field ``x`` is paired with an ``x_engines`` list. All value
    fields default to ``None`` and all ``*_engines`` lists default to empty.
    """

    # Tri-state flag: None until a value has been recorded.
    is_open_access: Optional[bool] = None
    # Presumably the engines/sources that supplied the flag -- verify
    # against the code that fills these lists.
    is_open_access_engines: List[str] = Field(default_factory=list)

    oa_status: Optional[str] = None  # gold, green, bronze, hybrid, closed
    oa_status_engines: List[str] = Field(default_factory=list)

    oa_url: Optional[str] = None  # URL to open access version
    oa_url_engines: List[str] = Field(default_factory=list)

    license: Optional[str] = None  # CC-BY, CC-BY-NC, etc.
    license_engines: List[str] = Field(default_factory=list)

    license_url: Optional[str] = None
    license_url_engines: List[str] = Field(default_factory=list)

    # For paywalled journals - opt-in for local/personal users
    # (these two flags have no paired *_engines list)
    paywall_bypass_attempted: Optional[bool] = None
    paywall_bypass_success: Optional[bool] = None

    class Config:
        populate_by_name = True
        validate_assignment = True  # Validate on attribute assignment too
|
|
316
|
+
|
|
317
|
+
|
|
287
318
|
class SystemMetadata(BaseModel):
|
|
288
319
|
"""System tracking metadata (which engines were used to search)."""
|
|
289
320
|
|
|
@@ -313,6 +344,7 @@ class PaperMetadataStructure(BaseModel):
|
|
|
313
344
|
)
|
|
314
345
|
url: URLMetadata = Field(default_factory=URLMetadata)
|
|
315
346
|
path: PathMetadata = Field(default_factory=PathMetadata)
|
|
347
|
+
access: AccessMetadata = Field(default_factory=AccessMetadata)
|
|
316
348
|
system: SystemMetadata = Field(default_factory=SystemMetadata)
|
|
317
349
|
|
|
318
350
|
class Config:
|
|
@@ -418,6 +450,7 @@ class PaperMetadataStructure(BaseModel):
|
|
|
418
450
|
),
|
|
419
451
|
"url": self.url.model_dump(by_alias=True, **kwargs),
|
|
420
452
|
"path": self.path.model_dump(by_alias=True, **kwargs),
|
|
453
|
+
"access": self.access.model_dump(by_alias=True, **kwargs),
|
|
421
454
|
"system": self.system.model_dump(by_alias=True, **kwargs),
|
|
422
455
|
}
|
|
423
456
|
|
|
@@ -489,6 +522,75 @@ class Paper(BaseModel):
|
|
|
489
522
|
"""
|
|
490
523
|
return self.model_dump()
|
|
491
524
|
|
|
525
|
+
def detect_open_access(
    self,
    use_unpaywall: bool = False,
    update_metadata: bool = True,
) -> "OAResult":
    """
    Detect open access status for this paper.

    Passes the paper's DOI, arXiv ID, journal name and any previously
    recorded open-access flag to ``check_oa_status`` and optionally
    writes the outcome back into ``self.metadata.access``.

    When metadata is updated, ``is_open_access`` is always overwritten;
    ``oa_status``, ``oa_url`` and ``license`` are only overwritten when
    the result carries a truthy value for them. Each updated field gets
    the provenance tag ``detect_oa:<result.source>`` in its ``*_engines``
    list. The tag is added at most once per list, so calling this method
    repeatedly does not accumulate duplicate entries.

    Args:
        use_unpaywall: If True, query Unpaywall API for uncertain cases
        update_metadata: If True, update self.metadata.access with results

    Returns:
        OAResult with detection results
    """
    # Imported at call time rather than at module level, as before
    # (presumably to avoid an import cycle -- confirm).
    from .open_access import check_oa_status

    result = check_oa_status(
        doi=self.metadata.id.doi,
        arxiv_id=self.metadata.id.arxiv_id,
        pmcid=None,  # Not currently in IDMetadata
        source=None,  # Source tracking not in Paper
        journal=self.metadata.publication.journal,
        is_open_access_flag=self.metadata.access.is_open_access,
        use_unpaywall=use_unpaywall,
    )

    if not update_metadata:
        return result

    access = self.metadata.access
    engine_tag = f"detect_oa:{result.source}"

    def _tag(engines):
        # Record provenance once; repeated detections must not pile up
        # identical tags in the same list.
        if engine_tag not in engines:
            engines.append(engine_tag)

    access.is_open_access = result.is_open_access
    _tag(access.is_open_access_engines)

    if result.status:
        access.oa_status = result.status.value
        _tag(access.oa_status_engines)

    if result.oa_url:
        access.oa_url = result.oa_url
        _tag(access.oa_url_engines)

    if result.license:
        access.license = result.license
        _tag(access.license_engines)

    return result
|
|
577
|
+
|
|
578
|
+
@property
def is_open_access(self) -> bool:
    """Open-access verdict for this paper (meant as a quick check, no API calls).

    A flag already stored in ``self.metadata.access.is_open_access`` is
    returned as-is. Only when that flag is ``None`` is the answer derived
    from the DOI, arXiv ID and journal name via
    ``detect_oa_from_identifiers``; the derived answer is not stored.
    """
    recorded_flag = self.metadata.access.is_open_access
    if recorded_flag is None:
        # Quick detection from identifiers
        from .open_access import detect_oa_from_identifiers

        identifiers = self.metadata.id
        detection = detect_oa_from_identifiers(
            doi=identifiers.doi,
            arxiv_id=identifiers.arxiv_id,
            journal=self.metadata.publication.journal,
        )
        return detection.is_open_access

    return recorded_flag
|
|
593
|
+
|
|
492
594
|
|
|
493
595
|
if __name__ == "__main__":
|
|
494
596
|
import json
|
scitex/scholar/core/__init__.py
CHANGED
|
@@ -1,9 +1,53 @@
|
|
|
1
1
|
from .Paper import Paper
|
|
2
2
|
from .Papers import Papers
|
|
3
3
|
from .Scholar import Scholar
|
|
4
|
+
from .open_access import (
|
|
5
|
+
OAStatus,
|
|
6
|
+
OAResult,
|
|
7
|
+
detect_oa_from_identifiers,
|
|
8
|
+
check_oa_status,
|
|
9
|
+
check_oa_status_async,
|
|
10
|
+
is_open_access_source,
|
|
11
|
+
is_open_access_journal,
|
|
12
|
+
is_arxiv_id,
|
|
13
|
+
)
|
|
14
|
+
from .oa_cache import (
|
|
15
|
+
OASourcesCache,
|
|
16
|
+
get_oa_cache,
|
|
17
|
+
is_oa_journal_cached,
|
|
18
|
+
refresh_oa_cache,
|
|
19
|
+
)
|
|
20
|
+
from .journal_normalizer import (
|
|
21
|
+
JournalNormalizer,
|
|
22
|
+
get_journal_normalizer,
|
|
23
|
+
normalize_journal_name,
|
|
24
|
+
get_journal_issn_l,
|
|
25
|
+
is_same_journal,
|
|
26
|
+
refresh_journal_cache,
|
|
27
|
+
)
|
|
4
28
|
|
|
5
29
|
# Names re-exported by this package's ``__init__``; grouped by the module
# they are imported from.
__all__ = [
    # Core classes
    "Paper",
    "Papers",
    "Scholar",
    # Open Access detection
    "OAStatus",
    "OAResult",
    "detect_oa_from_identifiers",
    "check_oa_status",
    "check_oa_status_async",
    "is_open_access_source",
    "is_open_access_journal",
    "is_arxiv_id",
    # OA Cache
    "OASourcesCache",
    "get_oa_cache",
    "is_oa_journal_cached",
    "refresh_oa_cache",
    # Journal Normalizer
    "JournalNormalizer",
    "get_journal_normalizer",
    "normalize_journal_name",
    "get_journal_issn_l",
    "is_same_journal",
    "refresh_journal_cache",
]
|