scitex 2.4.1__py3-none-any.whl → 2.4.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- scitex/__version__.py +1 -1
- scitex/browser/__init__.py +53 -0
- scitex/browser/auth/__init__.py +35 -0
- scitex/browser/auth/google.py +381 -0
- scitex/browser/collaboration/__init__.py +5 -0
- scitex/browser/debugging/__init__.py +56 -0
- scitex/browser/debugging/_failure_capture.py +372 -0
- scitex/browser/debugging/_sync_session.py +259 -0
- scitex/browser/debugging/_test_monitor.py +284 -0
- scitex/browser/debugging/_visual_cursor.py +432 -0
- scitex/scholar/citation_graph/README.md +117 -0
- scitex/scholar/citation_graph/__init__.py +29 -0
- scitex/scholar/citation_graph/builder.py +214 -0
- scitex/scholar/citation_graph/database.py +246 -0
- scitex/scholar/citation_graph/example.py +96 -0
- scitex/scholar/citation_graph/models.py +80 -0
- scitex/scholar/config/ScholarConfig.py +23 -3
- scitex/scholar/config/default.yaml +56 -0
- scitex/scholar/core/Paper.py +102 -0
- scitex/scholar/core/__init__.py +44 -0
- scitex/scholar/core/journal_normalizer.py +524 -0
- scitex/scholar/core/oa_cache.py +285 -0
- scitex/scholar/core/open_access.py +457 -0
- scitex/scholar/metadata_engines/ScholarEngine.py +9 -1
- scitex/scholar/metadata_engines/individual/CrossRefLocalEngine.py +82 -21
- scitex/scholar/pdf_download/ScholarPDFDownloader.py +137 -0
- scitex/scholar/pdf_download/strategies/__init__.py +6 -0
- scitex/scholar/pdf_download/strategies/open_access_download.py +186 -0
- scitex/scholar/pipelines/ScholarPipelineSearchParallel.py +27 -9
- scitex/scholar/pipelines/ScholarPipelineSearchSingle.py +24 -8
- scitex/scholar/search_engines/ScholarSearchEngine.py +6 -1
- {scitex-2.4.1.dist-info → scitex-2.4.3.dist-info}/METADATA +1 -1
- {scitex-2.4.1.dist-info → scitex-2.4.3.dist-info}/RECORD +36 -20
- {scitex-2.4.1.dist-info → scitex-2.4.3.dist-info}/WHEEL +0 -0
- {scitex-2.4.1.dist-info → scitex-2.4.3.dist-info}/entry_points.txt +0 -0
- {scitex-2.4.1.dist-info → scitex-2.4.3.dist-info}/licenses/LICENSE +0 -0
|
@@ -0,0 +1,117 @@
|
|
|
1
|
+
# Citation Graph Module
|
|
2
|
+
|
|
3
|
+
Build and analyze citation networks for academic papers using CrossRef data.
|
|
4
|
+
|
|
5
|
+
## Features
|
|
6
|
+
|
|
7
|
+
- **Citation extraction**: Forward (papers a work cites) and reverse (papers citing a work) citation lookups
|
|
8
|
+
- **Similarity metrics**: Co-citation and bibliographic coupling analysis
|
|
9
|
+
- **Network building**: Construct graphs of related papers
|
|
10
|
+
- **Export formats**: JSON for D3.js, vis.js, Cytoscape
|
|
11
|
+
|
|
12
|
+
## Quick Start
|
|
13
|
+
|
|
14
|
+
```python
|
|
15
|
+
from scitex.scholar.citation_graph import CitationGraphBuilder
|
|
16
|
+
|
|
17
|
+
# Initialize with CrossRef database
|
|
18
|
+
builder = CitationGraphBuilder("/path/to/crossref.db")
|
|
19
|
+
|
|
20
|
+
# Build citation network for a paper
|
|
21
|
+
graph = builder.build("10.1038/s41586-020-2008-3", top_n=20)
|
|
22
|
+
|
|
23
|
+
# Export for visualization
|
|
24
|
+
builder.export_json(graph, "network.json")
|
|
25
|
+
|
|
26
|
+
# Get paper summary
|
|
27
|
+
summary = builder.get_paper_summary("10.1038/s41586-020-2008-3")
|
|
28
|
+
```
|
|
29
|
+
|
|
30
|
+
## Architecture
|
|
31
|
+
|
|
32
|
+
```
|
|
33
|
+
citation_graph/
|
|
34
|
+
├── __init__.py # Package exports
|
|
35
|
+
├── builder.py # CitationGraphBuilder (main interface)
|
|
36
|
+
├── database.py # Database queries and connection management
|
|
37
|
+
├── models.py # Data models (PaperNode, CitationEdge, CitationGraph)
|
|
38
|
+
└── README.md # This file
|
|
39
|
+
```
|
|
40
|
+
|
|
41
|
+
## Similarity Metrics
|
|
42
|
+
|
|
43
|
+
### 1. Co-citation
|
|
44
|
+
Papers are related if they are frequently cited together.
|
|
45
|
+
- **Algorithm**: Find papers that appear together in reference lists
|
|
46
|
+
- **Weight**: 2.0 (default)
|
|
47
|
+
- **Use case**: Find foundational/seminal works in the same field
|
|
48
|
+
|
|
49
|
+
### 2. Bibliographic Coupling
|
|
50
|
+
Papers are related if they cite the same references.
|
|
51
|
+
- **Algorithm**: Count shared references between papers
|
|
52
|
+
- **Weight**: 2.0 (default)
|
|
53
|
+
- **Use case**: Find papers addressing similar problems/methods
|
|
54
|
+
|
|
55
|
+
### 3. Direct Citations
|
|
56
|
+
Papers directly citing or cited by the seed paper.
|
|
57
|
+
- **Weight**: 1.0 (default)
|
|
58
|
+
- **Use case**: Find immediately related work
|
|
59
|
+
|
|
60
|
+
## Performance
|
|
61
|
+
|
|
62
|
+
Based on experiments with 47M+ citations:
|
|
63
|
+
|
|
64
|
+
| Operation | Time | Status |
|
|
65
|
+
|-----------|------|--------|
|
|
66
|
+
| Forward citations | 0.1ms | ⚡ Excellent |
|
|
67
|
+
| Reverse citations | 3.3s | ✓ Good |
|
|
68
|
+
| Co-citation | 3.2s | ✓ Good |
|
|
69
|
+
| Bibliographic coupling | 25s | ⚠️ Needs optimization |
|
|
70
|
+
| **Full network build** | **~30s** | ✓ Acceptable |
|
|
71
|
+
|
|
72
|
+
## Database Schema
|
|
73
|
+
|
|
74
|
+
Requires CrossRef database with:
|
|
75
|
+
- `works` table: Paper metadata (`doi` column plus a `metadata` column holding the CrossRef record as JSON)
|
|
76
|
+
- `citations` table: Citation relationships (citing_doi, cited_doi, citing_year)
|
|
77
|
+
|
|
78
|
+
## Example Output
|
|
79
|
+
|
|
80
|
+
```json
|
|
81
|
+
{
|
|
82
|
+
"seed": "10.1038/s41586-020-2008-3",
|
|
83
|
+
"nodes": [
|
|
84
|
+
{
|
|
85
|
+
"id": "10.1038/s41586-020-2008-3",
|
|
86
|
+
"title": "A Randomized Controlled Trial...",
|
|
87
|
+
"year": 2020,
|
|
88
|
+
"authors": ["Smith J", "Jones A"],
|
|
89
|
+
"journal": "Nature",
|
|
90
|
+
"similarity_score": 100.0
|
|
91
|
+
},
|
|
92
|
+
...
|
|
93
|
+
],
|
|
94
|
+
"edges": [
|
|
95
|
+
{
|
|
96
|
+
"source": "10.1038/s41586-020-2008-3",
|
|
97
|
+
"target": "10.1016/j.cell.2019.11.025",
|
|
98
|
+
"type": "cites"
|
|
99
|
+
},
|
|
100
|
+
...
|
|
101
|
+
]
|
|
102
|
+
}
|
|
103
|
+
```
|
|
104
|
+
|
|
105
|
+
## Future Enhancements
|
|
106
|
+
|
|
107
|
+
- [ ] Redis caching for popular papers
|
|
108
|
+
- [ ] Async/parallel query execution
|
|
109
|
+
- [ ] Additional similarity metrics (topic modeling, author networks)
|
|
110
|
+
- [ ] GraphQL API
|
|
111
|
+
- [ ] Real-time updates
|
|
112
|
+
|
|
113
|
+
## References
|
|
114
|
+
|
|
115
|
+
- Co-citation: Small, H. (1973). Co-citation in the scientific literature. *J. Am. Soc. Inf. Sci.*
|
|
116
|
+
- Bibliographic coupling: Kessler, M. M. (1963). Bibliographic coupling. *American Documentation*
|
|
117
|
+
- Connected Papers (inspiration): https://www.connectedpapers.com/
|
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Citation Graph Module
|
|
3
|
+
|
|
4
|
+
Build and analyze citation networks for academic papers using CrossRef data.
|
|
5
|
+
|
|
6
|
+
This module provides tools to:
|
|
7
|
+
- Extract citation relationships
|
|
8
|
+
- Calculate paper similarity (co-citation, bibliographic coupling)
|
|
9
|
+
- Build citation network graphs
|
|
10
|
+
- Export for visualization (D3.js, vis.js, Cytoscape)
|
|
11
|
+
|
|
12
|
+
Example:
|
|
13
|
+
>>> from scitex.scholar.citation_graph import CitationGraphBuilder
|
|
14
|
+
>>>
|
|
15
|
+
>>> builder = CitationGraphBuilder(db_path="/path/to/crossref.db")
|
|
16
|
+
>>> graph = builder.build("10.1038/s41586-020-2008-3", top_n=20)
|
|
17
|
+
>>> builder.export_json(graph, "network.json")
|
|
18
|
+
"""
|
|
19
|
+
|
|
20
|
+
from .builder import CitationGraphBuilder
|
|
21
|
+
from .models import PaperNode, CitationEdge, CitationGraph
|
|
22
|
+
|
|
23
|
+
__version__ = "0.1.0"
|
|
24
|
+
__all__ = [
|
|
25
|
+
"CitationGraphBuilder",
|
|
26
|
+
"PaperNode",
|
|
27
|
+
"CitationEdge",
|
|
28
|
+
"CitationGraph",
|
|
29
|
+
]
|
|
@@ -0,0 +1,214 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Citation Graph Builder
|
|
3
|
+
|
|
4
|
+
Main interface for building citation networks from CrossRef data.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
import json
|
|
8
|
+
from pathlib import Path
|
|
9
|
+
from typing import Optional, List
|
|
10
|
+
from collections import Counter
|
|
11
|
+
|
|
12
|
+
from .database import CitationDatabase
|
|
13
|
+
from .models import PaperNode, CitationEdge, CitationGraph
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
class CitationGraphBuilder:
    """
    Build citation network graphs for academic papers.

    The builder asks ``CitationDatabase`` for combined similarity scores
    around a seed DOI, keeps the highest-scoring papers, and assembles them
    (plus the citation links among them) into a ``CitationGraph``.

    Example:
        >>> builder = CitationGraphBuilder("/path/to/crossref.db")
        >>> graph = builder.build("10.1038/s41586-020-2008-3", top_n=20)
        >>> builder.export_json(graph, "network.json")
    """

    # Score given to the seed paper's own node.
    SEED_SIMILARITY_SCORE = 100.0
    # Maximum references fetched per paper when looking for in-network edges.
    EDGE_REFERENCE_LIMIT = 100
    # Row cap applied when counting references/citations for a summary.
    SUMMARY_COUNT_LIMIT = 1000

    def __init__(self, db_path: str):
        """
        Initialize builder with database path.

        Args:
            db_path: Path to CrossRef SQLite database

        Raises:
            FileNotFoundError: Propagated from ``CitationDatabase`` when the
                database file does not exist.
        """
        self.db_path = db_path
        self.db = CitationDatabase(db_path)

    def build(
        self,
        seed_doi: str,
        top_n: int = 20,
        weight_coupling: float = 2.0,
        weight_cocitation: float = 2.0,
        weight_direct: float = 1.0,
    ) -> "CitationGraph":
        """
        Build citation network around a seed paper.

        Args:
            seed_doi: DOI of the seed paper
            top_n: Number of most similar papers to include (the seed node
                is added on top of these)
            weight_coupling: Weight for bibliographic coupling
            weight_cocitation: Weight for co-citation
            weight_direct: Weight for direct citations

        Returns:
            CitationGraph object with nodes and edges
        """
        seed_key = seed_doi.lower()

        with self.db:
            scores = self.db.get_combined_similarity_scores(
                seed_doi,
                weight_coupling=weight_coupling,
                weight_cocitation=weight_cocitation,
                weight_direct=weight_direct,
            )

            # Drop the seed before truncating: a self-citation can put the
            # seed into the scores, which would otherwise duplicate its node
            # and consume one of the top_n slots.
            related = [
                doi
                for doi, _ in scores.most_common()
                if doi.lower() != seed_key
            ][: max(top_n, 0)]

            nodes = [
                self._create_paper_node(seed_doi, self.SEED_SIMILARITY_SCORE)
            ]
            nodes.extend(
                self._create_paper_node(doi, scores[doi]) for doi in related
            )

            # Edges are citations between papers that are both in the network.
            edges = self._build_citation_edges([seed_doi] + related)

        return CitationGraph(
            seed_doi=seed_doi,
            nodes=nodes,
            edges=edges,
            metadata={
                "top_n": top_n,
                "weights": {
                    "coupling": weight_coupling,
                    "cocitation": weight_cocitation,
                    "direct": weight_direct,
                },
            },
        )

    @staticmethod
    def _first_text(metadata: dict, key: str, default: str) -> str:
        """Return the first non-empty entry of a list-valued field, else *default*."""
        values = metadata.get(key)
        if isinstance(values, str):
            return values or default
        if values and values[0]:
            return str(values[0])
        return default

    @staticmethod
    def _extract_year(metadata: dict) -> int:
        """Return the first element of ``published.date-parts[0]``, or 0 when absent/null."""
        published = metadata.get("published") or {}
        date_parts = published.get("date-parts") or []
        if date_parts and date_parts[0] and date_parts[0][0]:
            return date_parts[0][0]
        return 0

    @staticmethod
    def _author_names(
        metadata: dict, limit: int, initials_only: bool
    ) -> List[str]:
        """Format up to *limit* authors as "Family Given" (or "Family G")."""
        names = []
        for author in (metadata.get("author") or [])[:limit]:
            family = author.get("family") or ""
            given = author.get("given") or ""
            if initials_only:
                given = given[:1]
            # Fall back to ``name`` for entries without family/given parts
            # (presumably organisational authors - verify against the data).
            name = f"{family} {given}".strip()
            if not name:
                name = (author.get("name") or "").strip()
            if name:
                names.append(name)
        return names

    def _create_paper_node(
        self, doi: str, similarity_score: float
    ) -> "PaperNode":
        """
        Create a PaperNode with metadata from database.

        Must be called while ``self.db`` is connected.

        Args:
            doi: DOI of the paper
            similarity_score: Calculated similarity score

        Returns:
            PaperNode object; when no metadata is found only ``doi`` and
            ``similarity_score`` are set.
        """
        metadata = self.db.get_paper_metadata(doi)
        if not metadata:
            return PaperNode(doi=doi, similarity_score=similarity_score)

        return PaperNode(
            doi=doi,
            title=self._first_text(metadata, "title", "Unknown")[:200],
            year=self._extract_year(metadata),
            authors=self._author_names(metadata, limit=3, initials_only=True),
            journal=self._first_text(metadata, "container-title", ""),
            similarity_score=similarity_score,
        )

    def _build_citation_edges(self, dois: List[str]) -> "List[CitationEdge]":
        """
        Build citation edges between papers in the network.

        Must be called while ``self.db`` is connected. Only the first
        ``EDGE_REFERENCE_LIMIT`` references of each paper are inspected.

        Args:
            dois: List of DOIs in the network

        Returns:
            List of CitationEdge objects
        """
        # Map lower-cased DOI -> DOI as used for the node, so matching is
        # case-insensitive and edge targets always equal a node id.
        node_ids = {}
        for doi in dois:
            node_ids.setdefault(doi.lower(), doi)

        edges = []
        for doi in node_ids.values():
            refs = self.db.get_references(doi, limit=self.EDGE_REFERENCE_LIMIT)
            for cited_doi in refs:
                if not cited_doi:
                    continue
                target = node_ids.get(cited_doi.lower())
                if target is not None:
                    edges.append(
                        CitationEdge(
                            source=doi,
                            target=target,
                            edge_type="cites",
                        )
                    )
        return edges

    def export_json(self, graph: "CitationGraph", output_path: str):
        """
        Export graph to JSON file for visualization.

        Args:
            graph: CitationGraph to export
            output_path: Path to output JSON file
        """
        output = Path(output_path)
        # Explicit encoding keeps the file identical across platforms.
        with output.open("w", encoding="utf-8") as f:
            json.dump(graph.to_dict(), f, indent=2)

    def get_paper_summary(self, doi: str) -> Optional[dict]:
        """
        Get summary information for a paper.

        Args:
            doi: DOI of the paper

        Returns:
            Dictionary with paper summary, or None when the paper has no
            metadata in the database. ``reference_count`` and
            ``citation_count`` are capped at ``SUMMARY_COUNT_LIMIT``.
        """
        with self.db:
            metadata = self.db.get_paper_metadata(doi)
            if not metadata:
                return None

            refs = self.db.get_references(doi, limit=self.SUMMARY_COUNT_LIMIT)
            citations = self.db.get_citations(
                doi, limit=self.SUMMARY_COUNT_LIMIT
            )

            return {
                "doi": doi,
                "title": self._first_text(metadata, "title", "Unknown"),
                "year": self._extract_year(metadata),
                "authors": self._author_names(
                    metadata, limit=5, initials_only=False
                ),
                "journal": self._first_text(
                    metadata, "container-title", "Unknown"
                ),
                "reference_count": len(refs),
                "citation_count": len(citations),
            }
|
|
@@ -0,0 +1,246 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Database access layer for citation graph queries.
|
|
3
|
+
|
|
4
|
+
Handles all SQL queries to the CrossRef SQLite database with
|
|
5
|
+
optimized queries and connection management.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
import sqlite3
|
|
9
|
+
import json
|
|
10
|
+
from pathlib import Path
|
|
11
|
+
from typing import List, Tuple, Dict, Optional
|
|
12
|
+
from collections import Counter
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
class CitationDatabase:
    """
    Database interface for citation graph operations.

    Wraps a SQLite connection to a CrossRef database and provides queries for:
    - Citation extraction (forward/reverse)
    - Co-citation analysis
    - Bibliographic coupling
    - Paper metadata lookup

    The queries read a ``citations`` table (``citing_doi``, ``cited_doi``,
    ``citing_year``) and a ``works`` table (``doi``, ``metadata`` as JSON).
    Use the instance as a context manager, or call ``connect()`` /
    ``close()`` explicitly, before running any query.
    """

    def __init__(self, db_path: str):
        """
        Record the database location; no connection is opened yet.

        Args:
            db_path: Path to CrossRef SQLite database

        Raises:
            FileNotFoundError: If ``db_path`` does not exist.
        """
        self.db_path = Path(db_path)
        if not self.db_path.exists():
            raise FileNotFoundError(f"Database not found: {db_path}")

        self.conn = None

    def connect(self, read_only: bool = True):
        """
        Connect to database, replacing any connection already open.

        Args:
            read_only: If True, open in read-only mode (default)
        """
        # Close a connection left over from an earlier connect() so that
        # repeated calls do not leak file handles.
        self.close()

        if read_only:
            # as_uri() percent-encodes characters such as '?', '#', '%' and
            # spaces that would otherwise be parsed as URI syntax.
            uri = f"{self.db_path.resolve().as_uri()}?mode=ro"
            self.conn = sqlite3.connect(
                uri,
                uri=True,
                check_same_thread=False,  # Allow multi-threaded access (e.g., Django)
            )
        else:
            self.conn = sqlite3.connect(
                self.db_path,
                check_same_thread=False,
            )

        self.conn.row_factory = sqlite3.Row

    def close(self):
        """Close database connection (no-op when not connected)."""
        if self.conn:
            self.conn.close()
            self.conn = None

    def __enter__(self):
        """Context manager entry: open a read-only connection."""
        self.connect()
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        """Context manager exit: close the connection."""
        self.close()

    def _execute(self, sql: str, params: tuple):
        """Run a parameterized query, failing clearly when not connected."""
        if self.conn is None:
            raise RuntimeError(
                "Database is not connected; call connect() or use a 'with' block"
            )
        return self.conn.execute(sql, params)

    def get_references(self, doi: str, limit: int = 100) -> List[str]:
        """
        Get papers cited by this DOI (its reference list; "forward citations").

        Args:
            doi: DOI of the paper (lower-cased before lookup)
            limit: Maximum number of references to return

        Returns:
            List of DOIs cited by the paper
        """
        cursor = self._execute(
            """
            SELECT cited_doi
            FROM citations
            WHERE citing_doi = ?
            LIMIT ?
            """,
            (doi.lower(), limit),
        )
        return [row[0] for row in cursor]

    def get_citations(self, doi: str, limit: int = 100) -> List[Tuple[str, int]]:
        """
        Get papers that cite this DOI ("reverse citations"), newest first.

        Args:
            doi: DOI of the paper (lower-cased before lookup)
            limit: Maximum number of citations to return

        Returns:
            List of (citing_doi, year) tuples
        """
        cursor = self._execute(
            """
            SELECT citing_doi, citing_year
            FROM citations
            WHERE cited_doi = ?
            ORDER BY citing_year DESC
            LIMIT ?
            """,
            (doi.lower(), limit),
        )
        return [(row[0], row[1]) for row in cursor]

    def get_cocited_papers(
        self, doi: str, limit: int = 50
    ) -> List[Tuple[str, int]]:
        """
        Find papers co-cited with this DOI.

        Papers are co-cited if they appear together in reference lists.

        Args:
            doi: DOI of the paper (lower-cased before lookup)
            limit: Maximum number of results

        Returns:
            List of (cocited_doi, cocitation_count) tuples, highest count first
        """
        key = doi.lower()
        cursor = self._execute(
            """
            SELECT c2.cited_doi, COUNT(*) as cocitation_count
            FROM citations c1
            JOIN citations c2 ON c1.citing_doi = c2.citing_doi
            WHERE c1.cited_doi = ?
              AND c2.cited_doi != ?
            GROUP BY c2.cited_doi
            ORDER BY cocitation_count DESC
            LIMIT ?
            """,
            (key, key, limit),
        )
        return [(row[0], row[1]) for row in cursor]

    def get_bibliographic_coupled_papers(
        self, doi: str, limit: int = 50
    ) -> List[Tuple[str, int]]:
        """
        Find papers with shared references (bibliographic coupling).

        Papers are bibliographically coupled if they cite the same references.

        Args:
            doi: DOI of the paper (lower-cased before lookup)
            limit: Maximum number of results

        Returns:
            List of (coupled_doi, shared_references_count) tuples, highest
            count first
        """
        key = doi.lower()
        cursor = self._execute(
            """
            SELECT c2.citing_doi, COUNT(*) as shared_refs
            FROM citations c1
            JOIN citations c2 ON c1.cited_doi = c2.cited_doi
            WHERE c1.citing_doi = ?
              AND c2.citing_doi != ?
            GROUP BY c2.citing_doi
            ORDER BY shared_refs DESC
            LIMIT ?
            """,
            (key, key, limit),
        )
        return [(row[0], row[1]) for row in cursor]

    def get_paper_metadata(self, doi: str) -> Optional[Dict]:
        """
        Get metadata for a paper from works table.

        The DOI is tried exactly as given first; if that finds nothing and the
        DOI contains upper-case characters, the lower-cased form is tried too.

        Args:
            doi: DOI of the paper

        Returns:
            Dictionary with paper metadata, or None if not found (or if the
            stored metadata is NULL)
        """
        # NOTE(review): the citations queries lower-case DOIs, so works.doi is
        # presumably stored lower-case as well - confirm against the schema.
        candidates = [doi]
        lowered = doi.lower()
        if lowered != doi:
            candidates.append(lowered)

        for candidate in candidates:
            row = self._execute(
                "SELECT metadata FROM works WHERE doi = ?", (candidate,)
            ).fetchone()
            if row is not None and row[0] is not None:
                return json.loads(row[0])
        return None

    def get_combined_similarity_scores(
        self,
        seed_doi: str,
        weight_coupling: float = 2.0,
        weight_cocitation: float = 2.0,
        weight_direct: float = 1.0,
        max_papers: int = 100,
        max_direct: int = 50,
    ) -> Counter:
        """
        Calculate combined similarity scores using multiple metrics.

        Combines:
        - Bibliographic coupling (shared references)
        - Co-citation (cited together)
        - Direct citations (cites or is cited by)

        Args:
            seed_doi: DOI of the seed paper
            weight_coupling: Weight for bibliographic coupling score
            weight_cocitation: Weight for co-citation score
            weight_direct: Weight for direct citation score
            max_papers: Maximum papers to consider for each of the coupling
                and co-citation metrics
            max_direct: Maximum references and maximum citing papers to
                consider for the direct-citation metric

        Returns:
            Counter with {doi: combined_score}; the seed itself is never
            included.
        """
        scores = Counter()

        # 1. Bibliographic coupling
        for doi, count in self.get_bibliographic_coupled_papers(
            seed_doi, limit=max_papers
        ):
            scores[doi] += count * weight_coupling

        # 2. Co-citation
        for doi, count in self.get_cocited_papers(seed_doi, limit=max_papers):
            scores[doi] += count * weight_cocitation

        # 3. Direct citations (both directions)
        for doi in self.get_references(seed_doi, limit=max_direct):
            scores[doi] += weight_direct
        for doi, _ in self.get_citations(seed_doi, limit=max_direct):
            scores[doi] += weight_direct

        # A self-citation would otherwise rank the seed against itself.
        for key in {seed_doi, seed_doi.lower()}:
            scores.pop(key, None)

        return scores
|
|
@@ -0,0 +1,96 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""
|
|
3
|
+
Example usage of the citation_graph module.
|
|
4
|
+
|
|
5
|
+
Run this from the scitex-code root:
|
|
6
|
+
python -m scitex.scholar.citation_graph.example
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
import sys
|
|
10
|
+
from pathlib import Path
|
|
11
|
+
|
|
12
|
+
# Add parent directory to path for imports
|
|
13
|
+
sys.path.insert(0, str(Path(__file__).parent.parent.parent.parent))
|
|
14
|
+
|
|
15
|
+
from scitex.scholar.citation_graph import CitationGraphBuilder
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
def select_top_similar(nodes, seed_doi, limit=10):
    """Return up to *limit* nodes other than the seed, highest similarity first."""
    seed_key = seed_doi.lower()
    ranked = sorted(
        (node for node in nodes if node.doi.lower() != seed_key),
        key=lambda node: node.similarity_score,
        reverse=True,
    )
    return ranked[:limit]


def main(argv=None):
    """
    Run the citation graph example end to end.

    Prints a summary of the seed paper, builds its citation network, lists
    the ten most similar papers and exports the graph to
    ``example_output.json`` next to this script.

    Args:
        argv: Optional argument list (defaults to ``sys.argv[1:]``). When a
            first argument is present it is used as the database path.

    Returns:
        Process exit code: 0 on success, 1 when the database or the seed
        paper cannot be found.
    """
    args = sys.argv[1:] if argv is None else list(argv)

    # Database path (adjust to your setup, or pass it as the first argument)
    if args:
        db_path = Path(args[0])
    else:
        db_path = Path.home() / "proj/crossref_local/data/crossref.db"

    if not db_path.exists():
        print(f"❌ Database not found: {db_path}")
        print("Please update the db_path in this script.")
        return 1

    print("=" * 70)
    print("  Citation Graph Example")
    print("=" * 70)
    print(f"\nDatabase: {db_path}")

    # Initialize builder
    builder = CitationGraphBuilder(str(db_path))

    # Example DOI (a well-cited paper)
    seed_doi = "10.1001/2013.jamapsychiatry.4"

    # Get paper summary
    print(f"\n1. Getting paper summary for {seed_doi}...")
    summary = builder.get_paper_summary(seed_doi)

    if not summary:
        print("Paper not found in database")
        return 1

    print(f"\nPaper: {summary['title']}")
    print(f"Authors: {', '.join(summary['authors'][:3])}")
    print(f"Year: {summary['year']}")
    print(f"Journal: {summary['journal']}")
    print(f"References: {summary['reference_count']}")
    print(f"Citations: {summary['citation_count']}")

    # Build citation network
    print("\n2. Building citation network (top 20 papers)...")
    graph = builder.build(seed_doi, top_n=20)

    print("\nNetwork built:")
    print(f"  Nodes: {graph.node_count}")
    print(f"  Edges: {graph.edge_count}")

    # Show top papers by similarity
    print("\nTop 10 most similar papers:")
    print(f"{'Rank':<5} {'Score':<7} {'Year':<6} {'Title':<60}")
    print("-" * 85)

    # The seed is removed before ranking so ranks run 1..10 without gaps.
    for rank, node in enumerate(select_top_similar(graph.nodes, seed_doi), 1):
        print(
            f"{rank:<5} {node.similarity_score:<7.1f} "
            f"{node.year:<6} {node.title[:60]:<60}"
        )

    # Export to JSON
    output_path = Path(__file__).parent / "example_output.json"
    builder.export_json(graph, str(output_path))
    print(f"\n3. Network exported to: {output_path}")
    print(f"   File size: {output_path.stat().st_size / 1024:.1f} KB")

    print("\n✅ Example complete!")
    print("\nNext steps:")
    print("  - Open example_output.json to see the graph data")
    print("  - Use this JSON with D3.js, vis.js, or Cytoscape for visualization")
    print("  - Integrate with scitex-cloud for API endpoints")

    return 0


if __name__ == "__main__":
    sys.exit(main())
|