scitex 2.4.1__py3-none-any.whl → 2.4.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (36) hide show
  1. scitex/__version__.py +1 -1
  2. scitex/browser/__init__.py +53 -0
  3. scitex/browser/auth/__init__.py +35 -0
  4. scitex/browser/auth/google.py +381 -0
  5. scitex/browser/collaboration/__init__.py +5 -0
  6. scitex/browser/debugging/__init__.py +56 -0
  7. scitex/browser/debugging/_failure_capture.py +372 -0
  8. scitex/browser/debugging/_sync_session.py +259 -0
  9. scitex/browser/debugging/_test_monitor.py +284 -0
  10. scitex/browser/debugging/_visual_cursor.py +432 -0
  11. scitex/scholar/citation_graph/README.md +117 -0
  12. scitex/scholar/citation_graph/__init__.py +29 -0
  13. scitex/scholar/citation_graph/builder.py +214 -0
  14. scitex/scholar/citation_graph/database.py +246 -0
  15. scitex/scholar/citation_graph/example.py +96 -0
  16. scitex/scholar/citation_graph/models.py +80 -0
  17. scitex/scholar/config/ScholarConfig.py +23 -3
  18. scitex/scholar/config/default.yaml +56 -0
  19. scitex/scholar/core/Paper.py +102 -0
  20. scitex/scholar/core/__init__.py +44 -0
  21. scitex/scholar/core/journal_normalizer.py +524 -0
  22. scitex/scholar/core/oa_cache.py +285 -0
  23. scitex/scholar/core/open_access.py +457 -0
  24. scitex/scholar/metadata_engines/ScholarEngine.py +9 -1
  25. scitex/scholar/metadata_engines/individual/CrossRefLocalEngine.py +82 -21
  26. scitex/scholar/pdf_download/ScholarPDFDownloader.py +137 -0
  27. scitex/scholar/pdf_download/strategies/__init__.py +6 -0
  28. scitex/scholar/pdf_download/strategies/open_access_download.py +186 -0
  29. scitex/scholar/pipelines/ScholarPipelineSearchParallel.py +27 -9
  30. scitex/scholar/pipelines/ScholarPipelineSearchSingle.py +24 -8
  31. scitex/scholar/search_engines/ScholarSearchEngine.py +6 -1
  32. {scitex-2.4.1.dist-info → scitex-2.4.3.dist-info}/METADATA +1 -1
  33. {scitex-2.4.1.dist-info → scitex-2.4.3.dist-info}/RECORD +36 -20
  34. {scitex-2.4.1.dist-info → scitex-2.4.3.dist-info}/WHEEL +0 -0
  35. {scitex-2.4.1.dist-info → scitex-2.4.3.dist-info}/entry_points.txt +0 -0
  36. {scitex-2.4.1.dist-info → scitex-2.4.3.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,80 @@
1
+ """
2
+ Data models for citation graphs.
3
+ """
4
+
5
+ from dataclasses import dataclass, field
6
+ from typing import List, Dict, Optional
7
+
8
+
9
@dataclass
class PaperNode:
    """One paper (a vertex) of the citation network.

    Only ``doi`` is required; every other field has a neutral default so a
    node can be created from an identifier alone and filled in later.

    ``metadata`` is a free-form dictionary kept on the instance only; it is
    not part of the exported dictionary produced by :meth:`to_dict`.
    """

    doi: str
    title: str = ""
    year: int = 0
    authors: List[str] = field(default_factory=list)
    journal: str = ""
    citation_count: int = 0
    similarity_score: float = 0.0
    metadata: Dict = field(default_factory=dict)

    def to_dict(self) -> Dict:
        """Return the JSON-exportable view of this node.

        The DOI is exported under the key ``"id"``; the remaining exported
        fields keep their attribute names.  Values are returned as-is (the
        ``authors`` list is the same list object held by the node).
        """
        exported: Dict = {"id": self.doi}
        # Exported attributes, in the order they appear in the output.
        for attr_name in (
            "title",
            "year",
            "authors",
            "journal",
            "citation_count",
            "similarity_score",
        ):
            exported[attr_name] = getattr(self, attr_name)
        return exported
33
+
34
+
35
@dataclass
class CitationEdge:
    """A directed, weighted relationship between two papers.

    Both endpoints are identified by DOI: ``source`` is the citing paper and
    ``target`` is the cited paper.
    """

    # DOI of the citing paper.
    source: str
    # DOI of the cited paper.
    target: str
    # Relationship kind: 'cites', 'cited_by' or 'similar'.
    edge_type: str = "cites"
    weight: float = 1.0

    def to_dict(self) -> Dict:
        """Return the JSON-exportable view of this edge.

        ``edge_type`` is exported under the shorter key ``"type"``.
        """
        return dict(
            source=self.source,
            target=self.target,
            type=self.edge_type,
            weight=self.weight,
        )
52
+
53
+
54
@dataclass
class CitationGraph:
    """A whole citation network: nodes, edges and graph-level metadata.

    ``seed_doi`` is exported under the key ``"seed"`` (presumably the DOI the
    graph was grown from -- confirm against the builder).
    """

    seed_doi: str
    nodes: List[PaperNode] = field(default_factory=list)
    edges: List[CitationEdge] = field(default_factory=list)
    metadata: Dict = field(default_factory=dict)

    def to_dict(self) -> Dict:
        """Return the JSON-exportable view of the graph.

        Each node and edge is serialised through its own ``to_dict``;
        ``metadata`` is passed through unchanged.
        """
        payload: Dict = {"seed": self.seed_doi}

        node_dicts = []
        for paper in self.nodes:
            node_dicts.append(paper.to_dict())
        payload["nodes"] = node_dicts

        edge_dicts = []
        for link in self.edges:
            edge_dicts.append(link.to_dict())
        payload["edges"] = edge_dicts

        payload["metadata"] = self.metadata
        return payload

    @property
    def node_count(self) -> int:
        """Number of nodes currently in the graph."""
        total_nodes = len(self.nodes)
        return total_nodes

    @property
    def edge_count(self) -> int:
        """Number of edges currently in the graph."""
        total_edges = len(self.edges)
        return total_edges
@@ -29,8 +29,22 @@ logger = getLogger(__name__)
29
29
 
30
30
 
31
31
  class ScholarConfig:
32
- def __init__(self, config_path: Optional[Union[str, Path]] = None):
32
+ def __init__(
33
+ self,
34
+ config_path: Optional[Union[str, Path]] = None,
35
+ scholar_dir: Optional[Union[str, Path]] = None,
36
+ ):
37
+ """Initialize ScholarConfig.
38
+
39
+ Args:
40
+ config_path: Path to custom config YAML file
41
+ scholar_dir: Direct path to scholar directory (e.g., /data/users/alice/.scitex)
42
+ This bypasses SCITEX_DIR env var for thread-safe multi-user usage.
43
+ Use this in Django/multi-user environments to avoid race conditions.
44
+ """
33
45
  self.name = self.__class__.__name__
46
+ self._explicit_scholar_dir = scholar_dir # Store for thread-safe access
47
+
34
48
  if config_path and Path(config_path).exists():
35
49
  config_data = self.load_yaml(config_path)
36
50
  else:
@@ -114,8 +128,14 @@ class ScholarConfig:
114
128
 
115
129
  # Path Management ----------------------------------------
116
130
  def _setup_path_manager(self, scholar_dir=None):
117
- scholar_dir = self.cascade.resolve("scholar_dir", default="~/.scitex")
118
- base_path = Path(scholar_dir).expanduser() / "scholar"
131
+ # Priority: explicit parameter > env var > config > default
132
+ if self._explicit_scholar_dir:
133
+ # Use explicitly provided path (thread-safe for multi-user)
134
+ base_path = Path(self._explicit_scholar_dir).expanduser() / "scholar"
135
+ else:
136
+ # Fall back to cascade resolution (uses SCITEX_DIR env var)
137
+ scholar_dir = self.cascade.resolve("scholar_dir", default="~/.scitex")
138
+ base_path = Path(scholar_dir).expanduser() / "scholar"
119
139
  self.path_manager = PathManager(scholar_dir=base_path)
120
140
 
121
141
  @property
@@ -38,6 +38,61 @@ timeout: 60
38
38
  # Note: Download directory is fixed at {scholar_dir}/library/downloads/
39
39
  # Use get_library_downloads_dir() to access it
40
40
 
41
+ # Open Access & Paywall Options
42
+ # ----------------------------------------
43
+ # Prefer Open Access versions when available
44
+ prefer_open_access: ${SCITEX_SCHOLAR_PREFER_OPEN_ACCESS:-true}
45
+ # Allow attempting paywalled journal downloads (requires authentication)
46
+ # Only enable for local/personal use with valid institutional access
47
+ enable_paywall_access: ${SCITEX_SCHOLAR_ENABLE_PAYWALL_ACCESS:-false}
48
+ # Track paywall bypass attempts in metadata (for transparency)
49
+ track_paywall_attempts: ${SCITEX_SCHOLAR_TRACK_PAYWALL_ATTEMPTS:-true}
50
+
51
+ # Open Access Sources (repositories and publishers whose content is generally free to download)
52
+ OPENACCESS_SOURCES:
53
+ - arxiv
54
+ - pmc
55
+ - pubmed_central
56
+ - biorxiv
57
+ - medrxiv
58
+ - chemrxiv
59
+ - doaj
60
+ - plos
61
+ - peerj
62
+ - frontiers
63
+ - mdpi
64
+ - hindawi
65
+ - nature_communications # Most articles are OA
66
+
67
+ # Open Access Journals (partial list, DOAJ has full list)
68
+ # These are journal name patterns (case-insensitive substring match)
69
+ OPENACCESS_JOURNALS:
70
+ - "plos one"
71
+ - "plos biology"
72
+ - "plos medicine"
73
+ - "plos computational biology"
74
+ - "plos genetics"
75
+ - "plos pathogens"
76
+ - "plos neglected tropical diseases"
77
+ - "scientific reports"
78
+ - "nature communications"
79
+ - "elife"
80
+ - "peerj"
81
+ - "frontiers in" # All Frontiers journals
82
+ - "bmc " # All BMC journals
83
+ - "journal of open source software"
84
+ - "f1000research"
85
+ - "gigascience"
86
+ - "cell reports"
87
+ - "science advances"
88
+ - "iscience"
89
+ - "heliyon"
90
+ - "cureus"
91
+ - "jmir" # Journal of Medical Internet Research
92
+
93
+ # Unpaywall API email (required for polite access)
94
+ unpaywall_email: ${SCITEX_SCHOLAR_UNPAYWALL_EMAIL:-"research@scitex.io"}
95
+
41
96
  # ----------------------------------------
42
97
  # cache
43
98
  # ----------------------------------------
@@ -54,6 +109,7 @@ timeout: 60
54
109
  # ----------------------------------------
55
110
  semantic_scholar_api_key: ${SCITEX_SCHOLAR_SEMANTIC_SCHOLAR_API_KEY:-null}
56
111
  crossref_api_key: ${SCITEX_SCHOLAR_CROSSREF_API_KEY:-null}
112
+ crossref_api_url: ${SCITEX_SCHOLAR_CROSSREF_API_URL:-"http://127.0.0.1:3333"}
57
113
  pubmed_api_key: ${SCITEX_SCHOLAR_PUBMED_API_KEY:-null}
58
114
  twocaptcha_api_key: ${SCITEX_SCHOLAR_2CAPTCHA_API_KEY:-null}
59
115
 
@@ -284,6 +284,37 @@ class PathMetadata(BaseModel):
284
284
  validate_assignment = True # Validate on attribute assignment too
285
285
 
286
286
 
287
class AccessMetadata(BaseModel):
    """Open-access and licensing information for a paper, with provenance.

    Every value field ``<name>`` is paired with a ``<name>_engines`` list
    that records which sources supplied the value.  All value fields default
    to ``None`` (unknown) and all provenance lists default to empty.
    """

    # Whether the paper is open access at all (None = not determined).
    is_open_access: Optional[bool] = None
    is_open_access_engines: List[str] = Field(default_factory=list)

    # Open-access category: gold, green, bronze, hybrid, closed.
    oa_status: Optional[str] = None
    oa_status_engines: List[str] = Field(default_factory=list)

    # URL of the open-access version of the paper.
    oa_url: Optional[str] = None
    oa_url_engines: List[str] = Field(default_factory=list)

    # License identifier: CC-BY, CC-BY-NC, etc.
    license: Optional[str] = None
    license_engines: List[str] = Field(default_factory=list)

    license_url: Optional[str] = None
    license_url_engines: List[str] = Field(default_factory=list)

    # Paywalled journals only -- an opt-in feature for local/personal users.
    # These two flags carry no provenance list.
    paywall_bypass_attempted: Optional[bool] = None
    paywall_bypass_success: Optional[bool] = None

    class Config:
        populate_by_name = True
        # Validate on attribute assignment too, not only at construction.
        validate_assignment = True
316
+
317
+
287
318
  class SystemMetadata(BaseModel):
288
319
  """System tracking metadata (which engines were used to search)."""
289
320
 
@@ -313,6 +344,7 @@ class PaperMetadataStructure(BaseModel):
313
344
  )
314
345
  url: URLMetadata = Field(default_factory=URLMetadata)
315
346
  path: PathMetadata = Field(default_factory=PathMetadata)
347
+ access: AccessMetadata = Field(default_factory=AccessMetadata)
316
348
  system: SystemMetadata = Field(default_factory=SystemMetadata)
317
349
 
318
350
  class Config:
@@ -418,6 +450,7 @@ class PaperMetadataStructure(BaseModel):
418
450
  ),
419
451
  "url": self.url.model_dump(by_alias=True, **kwargs),
420
452
  "path": self.path.model_dump(by_alias=True, **kwargs),
453
+ "access": self.access.model_dump(by_alias=True, **kwargs),
421
454
  "system": self.system.model_dump(by_alias=True, **kwargs),
422
455
  }
423
456
 
@@ -489,6 +522,75 @@ class Paper(BaseModel):
489
522
  """
490
523
  return self.model_dump()
491
524
 
525
def detect_open_access(
    self,
    use_unpaywall: bool = False,
    update_metadata: bool = True,
) -> "OAResult":
    """Detect the open-access status of this paper.

    Delegates to ``check_oa_status`` with the paper's DOI, arXiv ID,
    journal name and any previously stored open-access flag.

    Args:
        use_unpaywall: Forwarded to ``check_oa_status``; if True, the
            Unpaywall API is queried for uncertain cases.
        update_metadata: If True, write the result into
            ``self.metadata.access``.  ``is_open_access`` is always
            overwritten; ``oa_status``, ``oa_url`` and ``license`` are only
            overwritten when the result provides a truthy value.

    Returns:
        The ``OAResult`` returned by ``check_oa_status``.

    Note:
        Each updated field is tagged with ``"detect_oa:<result.source>"`` in
        its ``*_engines`` list.  The tag is added at most once per list, so
        calling this method repeatedly no longer piles up duplicate tags.
    """
    from .open_access import check_oa_status, OAResult

    paper_meta = self.metadata
    result = check_oa_status(
        doi=paper_meta.id.doi,
        arxiv_id=paper_meta.id.arxiv_id,
        pmcid=None,  # Not currently in IDMetadata
        source=None,  # Source tracking not in Paper
        journal=paper_meta.publication.journal,
        is_open_access_flag=paper_meta.access.is_open_access,
        use_unpaywall=use_unpaywall,
    )

    if not update_metadata:
        return result

    access = paper_meta.access
    engine_tag = f"detect_oa:{result.source}"

    def _tag(engines):
        # Record provenance once; repeated detection must not grow the list.
        if engine_tag not in engines:
            engines.append(engine_tag)

    access.is_open_access = result.is_open_access
    _tag(access.is_open_access_engines)

    if result.status:
        access.oa_status = result.status.value
        _tag(access.oa_status_engines)

    # The remaining optional fields share one shape: copy when truthy, tag.
    for field_name in ("oa_url", "license"):
        detected = getattr(result, field_name)
        if detected:
            setattr(access, field_name, detected)
            _tag(getattr(access, f"{field_name}_engines"))

    return result
577
+
578
@property
def is_open_access(self) -> bool:
    """Tell whether the paper is open access, without any API call.

    A flag already stored in ``self.metadata.access.is_open_access`` wins.
    Only when that flag is ``None`` is the status derived from the DOI,
    arXiv ID and journal name via ``detect_oa_from_identifiers``; the
    derived value is returned but not stored.
    """
    stored_flag = self.metadata.access.is_open_access
    if stored_flag is None:
        # Imported lazily, only on the path that needs it.
        from .open_access import detect_oa_from_identifiers

        identifiers = self.metadata.id
        detection = detect_oa_from_identifiers(
            doi=identifiers.doi,
            arxiv_id=identifiers.arxiv_id,
            journal=self.metadata.publication.journal,
        )
        return detection.is_open_access

    return stored_flag
593
+
492
594
 
493
595
  if __name__ == "__main__":
494
596
  import json
@@ -1,9 +1,53 @@
1
1
  from .Paper import Paper
2
2
  from .Papers import Papers
3
3
  from .Scholar import Scholar
4
+ from .open_access import (
5
+ OAStatus,
6
+ OAResult,
7
+ detect_oa_from_identifiers,
8
+ check_oa_status,
9
+ check_oa_status_async,
10
+ is_open_access_source,
11
+ is_open_access_journal,
12
+ is_arxiv_id,
13
+ )
14
+ from .oa_cache import (
15
+ OASourcesCache,
16
+ get_oa_cache,
17
+ is_oa_journal_cached,
18
+ refresh_oa_cache,
19
+ )
20
+ from .journal_normalizer import (
21
+ JournalNormalizer,
22
+ get_journal_normalizer,
23
+ normalize_journal_name,
24
+ get_journal_issn_l,
25
+ is_same_journal,
26
+ refresh_journal_cache,
27
+ )
4
28
 
5
29
  __all__ = [
6
30
  "Paper",
7
31
  "Papers",
8
32
  "Scholar",
33
+ "OAStatus",
34
+ "OAResult",
35
+ "detect_oa_from_identifiers",
36
+ "check_oa_status",
37
+ "check_oa_status_async",
38
+ "is_open_access_source",
39
+ "is_open_access_journal",
40
+ "is_arxiv_id",
41
+ # OA Cache
42
+ "OASourcesCache",
43
+ "get_oa_cache",
44
+ "is_oa_journal_cached",
45
+ "refresh_oa_cache",
46
+ # Journal Normalizer
47
+ "JournalNormalizer",
48
+ "get_journal_normalizer",
49
+ "normalize_journal_name",
50
+ "get_journal_issn_l",
51
+ "is_same_journal",
52
+ "refresh_journal_cache",
9
53
  ]