crossref-local 0.3.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- crossref_local/__init__.py +78 -0
- crossref_local/aio.py +236 -0
- crossref_local/api.py +153 -0
- crossref_local/citations.py +413 -0
- crossref_local/cli.py +257 -0
- crossref_local/config.py +72 -0
- crossref_local/db.py +136 -0
- crossref_local/fts.py +138 -0
- crossref_local/impact_factor/__init__.py +20 -0
- crossref_local/impact_factor/calculator.py +479 -0
- crossref_local/impact_factor/journal_lookup.py +274 -0
- crossref_local/models.py +186 -0
- crossref_local-0.3.0.dist-info/METADATA +200 -0
- crossref_local-0.3.0.dist-info/RECORD +16 -0
- crossref_local-0.3.0.dist-info/WHEEL +4 -0
- crossref_local-0.3.0.dist-info/entry_points.txt +2 -0
|
@@ -0,0 +1,274 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""
|
|
3
|
+
Journal lookup module for fast name-to-ISSN resolution.
|
|
4
|
+
|
|
5
|
+
Uses OpenAlex journals table (222k journals with IF proxy) for fast lookups.
|
|
6
|
+
Falls back to direct database query if table doesn't exist.
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
import json
|
|
10
|
+
import sqlite3
|
|
11
|
+
from typing import Dict, List, Optional
|
|
12
|
+
import logging
|
|
13
|
+
|
|
14
|
+
logger = logging.getLogger(__name__)


class JournalLookup:
    """
    Fast journal name to ISSN lookup.

    Uses the ``journals_openalex`` table for O(1) lookups with IF proxy data.
    Falls back to a slow ``works`` table scan if the OpenAlex table doesn't
    exist.
    """

    def __init__(self, db_path: str):
        """
        Initialize journal lookup.

        Args:
            db_path: Path to CrossRef SQLite database
        """
        self.db_path = db_path
        self.conn = sqlite3.connect(db_path)
        # sqlite3.Row enables access by column name (row["issn_l"]).
        self.conn.row_factory = sqlite3.Row
        self._openalex_exists = self._check_openalex_table()
        self._issn_lookup_exists = self._check_issn_lookup_table()

        if self._openalex_exists:
            logger.info("Using journals_openalex table for fast lookups")
        else:
            logger.warning(
                "journals_openalex table not found. "
                "Run download_openalex_journals.py for fast lookups. "
                "Falling back to slow query."
            )

    def _table_exists(self, table_name: str) -> bool:
        """Return True if *table_name* exists in the database."""
        cursor = self.conn.execute(
            "SELECT name FROM sqlite_master WHERE type='table' AND name=?",
            (table_name,),
        )
        return cursor.fetchone() is not None

    def _check_openalex_table(self) -> bool:
        """Check if OpenAlex journals table exists."""
        return self._table_exists("journals_openalex")

    def _check_issn_lookup_table(self) -> bool:
        """Check if ISSN lookup table exists."""
        return self._table_exists("issn_lookup")

    def get_issn(self, journal_name: str, strict: bool = True) -> Optional[str]:
        """
        Get ISSN for a journal name.

        Args:
            journal_name: Journal name (case-insensitive)
            strict: If True, only exact matches. If False, allow partial matches.

        Returns:
            ISSN string or None if not found
        """
        if self._openalex_exists:
            return self._get_issn_openalex(journal_name, strict)
        return self._get_issn_slow(journal_name, strict)

    def _get_issn_openalex(self, journal_name: str, strict: bool = True) -> Optional[str]:
        """Fast lookup using OpenAlex journals table."""
        # Try exact (case-insensitive) match first.
        cursor = self.conn.execute("""
            SELECT issn_l FROM journals_openalex
            WHERE name_lower = ?
            LIMIT 1
        """, (journal_name.lower(),))

        result = cursor.fetchone()
        if result and result[0]:
            return result[0]

        # In strict mode, never fall through to a fuzzy match.
        if strict:
            logger.debug(f"Strict mode: no exact match for '{journal_name}'")
            return None

        # Partial match: prefer the journal with the most works as the most
        # likely intended target.  NOTE(review): '%' / '_' in journal_name are
        # not escaped, so they act as LIKE wildcards.
        logger.warning(f"Using partial match for '{journal_name}' - results may be inaccurate")
        cursor = self.conn.execute("""
            SELECT issn_l, name FROM journals_openalex
            WHERE name_lower LIKE ?
            ORDER BY works_count DESC
            LIMIT 1
        """, (f"%{journal_name.lower()}%",))

        result = cursor.fetchone()
        if result and result[0]:
            logger.warning(f" Matched to: '{result[1]}'")
            return result[0]
        return None

    def _get_issn_slow(self, journal_name: str, strict: bool = True) -> Optional[str]:
        """Slow fallback: scan the works table via json_extract (full scan)."""
        if strict:
            # Exact match on the first container title.
            cursor = self.conn.execute("""
                SELECT DISTINCT json_extract(metadata, '$.ISSN[0]') as issn
                FROM works
                WHERE json_extract(metadata, '$.container-title[0]') = ?
                AND json_extract(metadata, '$.ISSN[0]') IS NOT NULL
                LIMIT 1
            """, (journal_name,))
        else:
            # Partial match (LIKE) on the first container title.
            cursor = self.conn.execute("""
                SELECT DISTINCT json_extract(metadata, '$.ISSN[0]') as issn
                FROM works
                WHERE json_extract(metadata, '$.container-title[0]') LIKE ?
                AND json_extract(metadata, '$.ISSN[0]') IS NOT NULL
                LIMIT 1
            """, (f"%{journal_name}%",))

        result = cursor.fetchone()
        return result[0] if result else None

    def search(self, query: str, limit: int = 10) -> List[Dict]:
        """
        Search for journals by name.

        Args:
            query: Search query (partial name match)
            limit: Maximum results to return

        Returns:
            List of journal info dictionaries with IF proxy.  Empty list when
            the journals_openalex table is unavailable.
        """
        if not self._openalex_exists:
            return []

        cursor = self.conn.execute("""
            SELECT name, issn_l, publisher, works_count,
                   two_year_mean_citedness, h_index
            FROM journals_openalex
            WHERE name_lower LIKE ?
            ORDER BY works_count DESC
            LIMIT ?
        """, (f"%{query.lower()}%", limit))

        return [
            {
                "name": row["name"],
                "issn": row["issn_l"],
                "publisher": row["publisher"],
                "works_count": row["works_count"],
                "if_proxy": row["two_year_mean_citedness"],
                "h_index": row["h_index"]
            }
            for row in cursor.fetchall()
        ]

    def get_info(self, issn: str) -> Optional[Dict]:
        """
        Get journal info by ISSN.

        Args:
            issn: Journal ISSN

        Returns:
            Journal info dictionary with IF proxy, or None if not found or
            the journals_openalex table is unavailable.
        """
        if not self._openalex_exists:
            return None

        # Try direct ISSN-L match first.
        cursor = self.conn.execute("""
            SELECT name, issn_l, issns, publisher, works_count,
                   two_year_mean_citedness, h_index, is_oa
            FROM journals_openalex
            WHERE issn_l = ?
            LIMIT 1
        """, (issn,))

        row = cursor.fetchone()

        # If not found, resolve alternate ISSNs through the issn_lookup table.
        if not row and self._issn_lookup_exists:
            cursor = self.conn.execute("""
                SELECT jo.name, jo.issn_l, jo.issns, jo.publisher, jo.works_count,
                       jo.two_year_mean_citedness, jo.h_index, jo.is_oa
                FROM issn_lookup il
                JOIN journals_openalex jo ON il.journal_id = jo.id
                WHERE il.issn = ?
                LIMIT 1
            """, (issn,))
            row = cursor.fetchone()

        if row:
            issns = []
            if row["issns"]:
                try:
                    issns = json.loads(row["issns"])
                except (json.JSONDecodeError, TypeError):
                    # Malformed JSON in the issns column: keep the empty list
                    # rather than failing the whole lookup.  (Previously a
                    # bare `except:` that also swallowed KeyboardInterrupt.)
                    pass
            return {
                "name": row["name"],
                "issn": row["issn_l"],
                "issns": issns,
                "publisher": row["publisher"],
                "works_count": row["works_count"],
                "if_proxy": row["two_year_mean_citedness"],
                "h_index": row["h_index"],
                "is_oa": row["is_oa"]
            }
        return None

    def get_if_proxy(self, journal_name: str, strict: bool = True) -> Optional[float]:
        """
        Get OpenAlex Impact Factor proxy for a journal.

        Args:
            journal_name: Journal name
            strict: If True, only exact matches

        Returns:
            2-year mean citedness (IF proxy) or None
        """
        if not self._openalex_exists:
            return None

        # Try exact match.
        cursor = self.conn.execute("""
            SELECT two_year_mean_citedness FROM journals_openalex
            WHERE name_lower = ?
            LIMIT 1
        """, (journal_name.lower(),))

        result = cursor.fetchone()
        # Explicit None check: a legitimate IF proxy of 0.0 must be returned,
        # not treated as "missing" (the old truthiness test dropped it).
        if result and result[0] is not None:
            return result[0]

        if strict:
            return None

        # Partial match (only if not strict), largest journal first.
        cursor = self.conn.execute("""
            SELECT two_year_mean_citedness FROM journals_openalex
            WHERE name_lower LIKE ?
            ORDER BY works_count DESC
            LIMIT 1
        """, (f"%{journal_name.lower()}%",))

        result = cursor.fetchone()
        return result[0] if result and result[0] is not None else None

    def close(self):
        """Close database connection."""
        if self.conn:
            self.conn.close()

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        self.close()
|
crossref_local/models.py
ADDED
|
@@ -0,0 +1,186 @@
|
|
|
1
|
+
"""Data models for crossref_local."""
|
|
2
|
+
|
|
3
|
+
from dataclasses import dataclass, field
|
|
4
|
+
from typing import List, Optional
|
|
5
|
+
import json
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
@dataclass
class Work:
    """
    Represents a scholarly work from CrossRef.

    Attributes:
        doi: Digital Object Identifier
        title: Work title
        authors: List of author names
        year: Publication year
        journal: Journal/container title
        issn: Journal ISSN
        volume: Volume number
        issue: Issue number
        page: Page range
        publisher: Publisher name
        type: Work type (journal-article, book-chapter, etc.)
        abstract: Abstract text (if available)
        url: Resource URL
        citation_count: Number of citations (if available)
        references: List of reference DOIs
    """

    doi: str
    title: Optional[str] = None
    authors: List[str] = field(default_factory=list)
    year: Optional[int] = None
    journal: Optional[str] = None
    issn: Optional[str] = None
    volume: Optional[str] = None
    issue: Optional[str] = None
    page: Optional[str] = None
    publisher: Optional[str] = None
    type: Optional[str] = None
    abstract: Optional[str] = None
    url: Optional[str] = None
    citation_count: Optional[int] = None
    references: List[str] = field(default_factory=list)

    @classmethod
    def from_metadata(cls, doi: str, metadata: dict) -> "Work":
        """
        Create Work from CrossRef metadata JSON.

        Args:
            doi: DOI string
            metadata: CrossRef metadata dictionary

        Returns:
            Work instance
        """
        # Extract authors: "Given Family", family-only, or literal "name"
        # (e.g. consortia).  Given-only entries are skipped.
        authors = []
        for author in metadata.get("author", []):
            given = author.get("given", "")
            family = author.get("family", "")
            if given and family:
                authors.append(f"{given} {family}")
            elif family:
                authors.append(family)
            elif author.get("name"):
                authors.append(author["name"])

        # Extract year from the "published" date-parts ([[year, month, day]]).
        year = None
        published = metadata.get("published", {})
        date_parts = published.get("date-parts", [[]])
        if date_parts and date_parts[0]:
            year = date_parts[0][0]

        # Extract reference DOIs (references without a DOI are dropped).
        references = []
        for ref in metadata.get("reference", []):
            if ref.get("DOI"):
                references.append(ref["DOI"])

        # Container title (journal name) - CrossRef stores a list.
        container_titles = metadata.get("container-title", [])
        journal = container_titles[0] if container_titles else None

        # ISSN - also a list; keep the first.
        issns = metadata.get("ISSN", [])
        issn = issns[0] if issns else None

        return cls(
            doi=doi,
            title=metadata.get("title", [None])[0] if metadata.get("title") else None,
            authors=authors,
            year=year,
            journal=journal,
            issn=issn,
            volume=metadata.get("volume"),
            issue=metadata.get("issue"),
            page=metadata.get("page"),
            publisher=metadata.get("publisher"),
            type=metadata.get("type"),
            abstract=metadata.get("abstract"),
            url=metadata.get("URL"),
            citation_count=metadata.get("is-referenced-by-count"),
            references=references,
        )

    def to_dict(self) -> dict:
        """Convert to dictionary."""
        return {
            "doi": self.doi,
            "title": self.title,
            "authors": self.authors,
            "year": self.year,
            "journal": self.journal,
            "issn": self.issn,
            "volume": self.volume,
            "issue": self.issue,
            "page": self.page,
            "publisher": self.publisher,
            "type": self.type,
            "abstract": self.abstract,
            "url": self.url,
            "citation_count": self.citation_count,
            "references": self.references,
        }

    def citation(self, style: str = "apa") -> str:
        """
        Format as citation string.

        Args:
            style: Citation style (currently only "apa" supported; other
                values are ignored and APA formatting is used)

        Returns:
            Formatted citation string
        """
        authors_str = ", ".join(self.authors[:3])
        if len(self.authors) > 3:
            authors_str += " et al."

        year_str = f"({self.year})" if self.year else "(n.d.)"
        title_str = self.title or "Untitled"
        journal_str = f"*{self.journal}*" if self.journal else ""

        parts = [authors_str, year_str, title_str]
        if journal_str:
            parts.append(journal_str)
        if self.volume:
            volume_str = f"{self.volume}"
            if self.issue:
                volume_str += f"({self.issue})"
            parts.append(volume_str)
        elif self.issue:
            # Fix: previously `parts[-1] += f"({issue})"` glued the issue
            # onto the journal name (or title) when no volume was present.
            parts.append(f"({self.issue})")
        if self.page:
            parts.append(self.page)
        parts.append(f"https://doi.org/{self.doi}")

        # filter(None, ...) drops empty segments (e.g. no authors).
        return ". ".join(filter(None, parts))
|
|
160
|
+
|
|
161
|
+
|
|
162
|
+
@dataclass
class SearchResult:
    """
    Container for search results with metadata.

    Behaves like a read-only sequence of its works: supports ``len()``,
    iteration, and indexing.

    Attributes:
        works: List of Work objects
        total: Total number of matches
        query: Original search query
        elapsed_ms: Search time in milliseconds
    """

    works: List[Work]
    total: int
    query: str
    elapsed_ms: float

    def __getitem__(self, idx):
        # Delegate indexing (and slicing) to the underlying list.
        return self.works[idx]

    def __iter__(self):
        return iter(self.works)

    def __len__(self) -> int:
        # Number of works actually returned (may be less than `total`).
        return len(self.works)
|
|
@@ -0,0 +1,200 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: crossref-local
|
|
3
|
+
Version: 0.3.0
|
|
4
|
+
Summary: Local CrossRef database with 167M+ works and full-text search
|
|
5
|
+
Project-URL: Homepage, https://github.com/ywatanabe1989/crossref_local
|
|
6
|
+
Project-URL: Repository, https://github.com/ywatanabe1989/crossref_local
|
|
7
|
+
Author: Yusuke Watanabe
|
|
8
|
+
License-Expression: AGPL-3.0
|
|
9
|
+
Keywords: academic,citations,crossref,doi,fts5,full-text-search,impact-factor,scholarly
|
|
10
|
+
Classifier: Development Status :: 4 - Beta
|
|
11
|
+
Classifier: Intended Audience :: Science/Research
|
|
12
|
+
Classifier: License :: OSI Approved :: GNU Affero General Public License v3
|
|
13
|
+
Classifier: Programming Language :: Python :: 3
|
|
14
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
17
|
+
Classifier: Topic :: Database
|
|
18
|
+
Classifier: Topic :: Scientific/Engineering
|
|
19
|
+
Requires-Python: >=3.10
|
|
20
|
+
Requires-Dist: click>=8.0
|
|
21
|
+
Provides-Extra: dev
|
|
22
|
+
Requires-Dist: pytest-cov>=4.0; extra == 'dev'
|
|
23
|
+
Requires-Dist: pytest>=7.0; extra == 'dev'
|
|
24
|
+
Provides-Extra: viz
|
|
25
|
+
Requires-Dist: matplotlib>=3.7; extra == 'viz'
|
|
26
|
+
Requires-Dist: networkx>=3.0; extra == 'viz'
|
|
27
|
+
Requires-Dist: pyvis>=0.3; extra == 'viz'
|
|
28
|
+
Description-Content-Type: text/markdown
|
|
29
|
+
|
|
30
|
+
# CrossRef Local
|
|
31
|
+
|
|
32
|
+
Local CrossRef database with 167M+ scholarly works, full-text search, and impact factor calculation.
|
|
33
|
+
|
|
34
|
+
[](https://github.com/ywatanabe1989/crossref-local/actions/workflows/test.yml)
|
|
35
|
+
[](https://www.python.org/downloads/)
|
|
36
|
+
[](LICENSE)
|
|
37
|
+
|
|
38
|
+
<p align="center">
|
|
39
|
+
<img src="examples/readme_figure.png" alt="CrossRef Local Demo" width="800"/>
|
|
40
|
+
</p>
|
|
41
|
+
|
|
42
|
+
<details>
|
|
43
|
+
<summary><strong>Why CrossRef Local?</strong></summary>
|
|
44
|
+
|
|
45
|
+
**Built for the LLM era** - features that matter for AI research assistants:
|
|
46
|
+
|
|
47
|
+
| Feature | Benefit |
|
|
48
|
+
|---------|---------|
|
|
49
|
+
| 📝 **Abstracts** | Full text for semantic understanding |
|
|
50
|
+
| 📊 **Impact Factor** | Filter by journal quality |
|
|
51
|
+
| 🔗 **Citations** | Prioritize influential papers |
|
|
52
|
+
| ⚡ **Speed** | 167M records in ms, no rate limits |
|
|
53
|
+
|
|
54
|
+
Perfect for: RAG systems, research assistants, literature review automation.
|
|
55
|
+
|
|
56
|
+
</details>
|
|
57
|
+
|
|
58
|
+
<details>
|
|
59
|
+
<summary><strong>Installation</strong></summary>
|
|
60
|
+
|
|
61
|
+
```bash
|
|
62
|
+
pip install crossref-local
|
|
63
|
+
```
|
|
64
|
+
|
|
65
|
+
From source:
|
|
66
|
+
```bash
|
|
67
|
+
git clone https://github.com/ywatanabe1989/crossref-local
|
|
68
|
+
cd crossref-local && make install
|
|
69
|
+
```
|
|
70
|
+
|
|
71
|
+
Database setup (1.5 TB, ~2 weeks to build):
|
|
72
|
+
```bash
|
|
73
|
+
# 1. Download CrossRef data (~100GB compressed)
|
|
74
|
+
aria2c "https://academictorrents.com/details/..."
|
|
75
|
+
|
|
76
|
+
# 2. Build SQLite database (~days)
|
|
77
|
+
pip install dois2sqlite
|
|
78
|
+
dois2sqlite build /path/to/crossref-data ./data/crossref.db
|
|
79
|
+
|
|
80
|
+
# 3. Build FTS5 index (~60 hours) & citations table (~days)
|
|
81
|
+
make fts-build-screen
|
|
82
|
+
make citations-build-screen
|
|
83
|
+
```
|
|
84
|
+
|
|
85
|
+
</details>
|
|
86
|
+
|
|
87
|
+
<details>
|
|
88
|
+
<summary><strong>Python API</strong></summary>
|
|
89
|
+
|
|
90
|
+
```python
|
|
91
|
+
from crossref_local import search, get, count
|
|
92
|
+
|
|
93
|
+
# Full-text search (22ms for 541 matches across 167M records)
|
|
94
|
+
results = search("hippocampal sharp wave ripples")
|
|
95
|
+
for work in results:
|
|
96
|
+
print(f"{work.title} ({work.year})")
|
|
97
|
+
|
|
98
|
+
# Get by DOI
|
|
99
|
+
work = get("10.1126/science.aax0758")
|
|
100
|
+
print(work.citation())
|
|
101
|
+
|
|
102
|
+
# Count matches
|
|
103
|
+
n = count("machine learning") # 477,922 matches
|
|
104
|
+
```
|
|
105
|
+
|
|
106
|
+
Async API:
|
|
107
|
+
```python
|
|
108
|
+
from crossref_local import aio
|
|
109
|
+
|
|
110
|
+
async def main():
|
|
111
|
+
counts = await aio.count_many(["CRISPR", "neural network", "climate"])
|
|
112
|
+
results = await aio.search("machine learning")
|
|
113
|
+
```
|
|
114
|
+
|
|
115
|
+
</details>
|
|
116
|
+
|
|
117
|
+
<details>
|
|
118
|
+
<summary><strong>CLI</strong></summary>
|
|
119
|
+
|
|
120
|
+
```bash
|
|
121
|
+
crossref-local search "CRISPR genome editing" -n 5
|
|
122
|
+
crossref-local get 10.1038/nature12373
|
|
123
|
+
crossref-local impact-factor Nature -y 2023 # IF: 54.067
|
|
124
|
+
```
|
|
125
|
+
|
|
126
|
+
With abstracts (`-a` flag):
|
|
127
|
+
```
|
|
128
|
+
$ crossref-local search "CRISPR" -n 1 -a
|
|
129
|
+
|
|
130
|
+
Found 87,473 matches in 18.2ms
|
|
131
|
+
|
|
132
|
+
1. RS-1 enhances CRISPR/Cas9- and TALEN-mediated knock-in efficiency (2016)
|
|
133
|
+
DOI: 10.1038/ncomms10548
|
|
134
|
+
Journal: Nature Communications
|
|
135
|
+
Abstract: Zinc-finger nuclease, transcription activator-like effector nuclease
|
|
136
|
+
and CRISPR/Cas9 are becoming major tools for genome editing. Importantly,
|
|
137
|
+
knock-in in several non-rodent species has been finally achieved...
|
|
138
|
+
```
|
|
139
|
+
|
|
140
|
+
</details>
|
|
141
|
+
|
|
142
|
+
<details>
|
|
143
|
+
<summary><strong>Impact Factor</strong></summary>
|
|
144
|
+
|
|
145
|
+
```python
|
|
146
|
+
from crossref_local.impact_factor import ImpactFactorCalculator
|
|
147
|
+
|
|
148
|
+
with ImpactFactorCalculator() as calc:
|
|
149
|
+
result = calc.calculate_impact_factor("Nature", target_year=2023)
|
|
150
|
+
print(f"IF: {result['impact_factor']:.3f}") # 54.067
|
|
151
|
+
```
|
|
152
|
+
|
|
153
|
+
| Journal | IF 2023 |
|
|
154
|
+
|---------|---------|
|
|
155
|
+
| Nature | 54.07 |
|
|
156
|
+
| Science | 46.17 |
|
|
157
|
+
| Cell | 54.01 |
|
|
158
|
+
| PLOS ONE | 3.37 |
|
|
159
|
+
|
|
160
|
+
</details>
|
|
161
|
+
|
|
162
|
+
<details>
|
|
163
|
+
<summary><strong>Citation Network</strong></summary>
|
|
164
|
+
|
|
165
|
+
```python
|
|
166
|
+
from crossref_local import get_citing, get_cited, CitationNetwork
|
|
167
|
+
|
|
168
|
+
citing = get_citing("10.1038/nature12373") # 1539 papers
|
|
169
|
+
cited = get_cited("10.1038/nature12373")
|
|
170
|
+
|
|
171
|
+
# Build visualization (like Connected Papers)
|
|
172
|
+
network = CitationNetwork("10.1038/nature12373", depth=2)
|
|
173
|
+
network.save_html("citation_network.html") # requires: pip install crossref-local[viz]
|
|
174
|
+
```
|
|
175
|
+
|
|
176
|
+
</details>
|
|
177
|
+
|
|
178
|
+
<details>
|
|
179
|
+
<summary><strong>Performance</strong></summary>
|
|
180
|
+
|
|
181
|
+
| Query | Matches | Time |
|
|
182
|
+
|-------|---------|------|
|
|
183
|
+
| `hippocampal sharp wave ripples` | 541 | 22ms |
|
|
184
|
+
| `machine learning` | 477,922 | 113ms |
|
|
185
|
+
| `CRISPR genome editing` | 12,170 | 257ms |
|
|
186
|
+
|
|
187
|
+
Searching 167M records in milliseconds via FTS5.
|
|
188
|
+
|
|
189
|
+
</details>
|
|
190
|
+
|
|
191
|
+
|
|
192
|
+
---
|
|
193
|
+
|
|
194
|
+
<p align="center">
|
|
195
|
+
<a href="https://scitex.ai"><img src="docs/scitex-icon-navy-inverted.png" alt="SciTeX" width="40"/></a>
|
|
196
|
+
<br>
|
|
197
|
+
AGPL-3.0 · ywatanabe@scitex.ai
|
|
198
|
+
</p>
|
|
199
|
+
|
|
200
|
+
<!-- EOF -->
|
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
crossref_local/__init__.py,sha256=vJFOk5somcdkuTT7UhVjAR4kj8p5JYVVHgmgCtTmENs,1544
|
|
2
|
+
crossref_local/aio.py,sha256=En2btSn3euRbEYav1919gsmdC8iQaMbgGUso-IThCwo,5490
|
|
3
|
+
crossref_local/api.py,sha256=FPZBStNLD7hnjjnpUzKfBFKo7gv3JHOLEIFSydH53bw,3370
|
|
4
|
+
crossref_local/citations.py,sha256=QFahv84upNnXP_89A8bHxEbAdz7wHbh5LEniGcAiHas,12402
|
|
5
|
+
crossref_local/cli.py,sha256=9ISMQSvWZHYrEO_gYXJB-Ju0lZCr9ifH8jFnltkgYTU,8639
|
|
6
|
+
crossref_local/config.py,sha256=_tHZOHBbbw5BcmkWdTlqbSffijlQZkPkexl2YxW4GmE,1980
|
|
7
|
+
crossref_local/db.py,sha256=xiTYFQxsIXmV6_QmXMWf6eX7GJv5D4icy67lBvQjUQI,3529
|
|
8
|
+
crossref_local/fts.py,sha256=e5mxAbHWrO9E1GvG1WFDFxoCs_4RQIdAPYLSuMf8JCM,3395
|
|
9
|
+
crossref_local/models.py,sha256=b_yYb91O6RwEPpEqe2Wmdz12WIfE5itjEus4-fCLxLI,5476
|
|
10
|
+
crossref_local/impact_factor/__init__.py,sha256=pcgVCPogBisANYE5Vp2PHVGPgxoMsSXr-6utqVE97-4,559
|
|
11
|
+
crossref_local/impact_factor/calculator.py,sha256=eZ13URAZzPdRyAQpS8zXe_T33e2lm_gQhtoJCXbfIGM,15977
|
|
12
|
+
crossref_local/impact_factor/journal_lookup.py,sha256=Ztx6ZeWxfmPvA3KfcW5h_yz01XPstIdk91j3nu2Q-qw,8846
|
|
13
|
+
crossref_local-0.3.0.dist-info/METADATA,sha256=3z5SlpYph6S6V6fwu5omXXyyNeGubZGvyqic8BU8-n8,5659
|
|
14
|
+
crossref_local-0.3.0.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
|
|
15
|
+
crossref_local-0.3.0.dist-info/entry_points.txt,sha256=TaFQ1y-tIym2dqgE6xUUeXTvy2uCHNKoYeRO4w6ndWQ,59
|
|
16
|
+
crossref_local-0.3.0.dist-info/RECORD,,
|