crossref-local 0.3.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- crossref_local/__init__.py +78 -0
- crossref_local/aio.py +236 -0
- crossref_local/api.py +153 -0
- crossref_local/citations.py +413 -0
- crossref_local/cli.py +257 -0
- crossref_local/config.py +72 -0
- crossref_local/db.py +136 -0
- crossref_local/fts.py +138 -0
- crossref_local/impact_factor/__init__.py +20 -0
- crossref_local/impact_factor/calculator.py +479 -0
- crossref_local/impact_factor/journal_lookup.py +274 -0
- crossref_local/models.py +186 -0
- crossref_local-0.3.0.dist-info/METADATA +200 -0
- crossref_local-0.3.0.dist-info/RECORD +16 -0
- crossref_local-0.3.0.dist-info/WHEEL +4 -0
- crossref_local-0.3.0.dist-info/entry_points.txt +2 -0
|
@@ -0,0 +1,479 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
# -*- coding: utf-8 -*-
|
|
3
|
+
"""
|
|
4
|
+
Impact Factor Calculator from CrossRef Local Database
|
|
5
|
+
|
|
6
|
+
Calculates journal impact factors by analyzing citation patterns
|
|
7
|
+
in the local CrossRef database.
|
|
8
|
+
"""
|
|
9
|
+
|
|
10
|
+
import json
|
|
11
|
+
import sqlite3
|
|
12
|
+
from pathlib import Path
|
|
13
|
+
from typing import Dict, List, Optional, Tuple
|
|
14
|
+
from collections import defaultdict
|
|
15
|
+
import logging
|
|
16
|
+
|
|
17
|
+
from .journal_lookup import JournalLookup
|
|
18
|
+
from ..config import Config
|
|
19
|
+
|
|
20
|
+
logger = logging.getLogger(__name__)
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
class ImpactFactorCalculator:
    """
    Calculate journal impact factors from local CrossRef database.

    Supports:
    - 2-year and 5-year impact factors
    - Moving averages
    - Multiple calculation methods
    - Journal identification by name or ISSN

    The instance owns one SQLite connection plus a ``JournalLookup`` helper;
    use it as a context manager (or call ``close()``) to release both.
    """

    # Upper bound on DOIs bound into a single ``IN (...)`` list. SQLite caps the
    # number of host parameters per statement (999 on older builds), so larger
    # DOI lists are split into batches instead of failing with
    # "too many SQL variables". 900 leaves room for the extra year parameter.
    _SQL_BATCH_SIZE = 900

    def __init__(self, db_path: Optional[str] = None):
        """
        Initialize calculator with database connection.

        Args:
            db_path: Path to CrossRef SQLite database. Auto-detects if None.

        Raises:
            FileNotFoundError: If the resolved database path does not exist.
        """
        # Set before any I/O so that close() is safe even if construction
        # fails half-way through.
        self.conn = None
        self._journal_lookup = None

        resolved = Config.get_db_path() if db_path is None else db_path
        self.db_path = Path(resolved)

        # sqlite3.connect() would silently create an empty database for a
        # missing file, so fail early and report the path actually used.
        if not self.db_path.exists():
            raise FileNotFoundError(f"Database not found: {self.db_path}")

        try:
            self._connect()
            self._journal_lookup = JournalLookup(str(self.db_path))
        except Exception:
            # Do not leak the connection when the lookup helper fails to build.
            self.close()
            raise

    def _connect(self):
        """Establish database connection (rows are returned as ``sqlite3.Row``)."""
        self.conn = sqlite3.connect(self.db_path)
        self.conn.row_factory = sqlite3.Row

    def close(self):
        """
        Close the database connection and the journal lookup helper.

        Safe to call more than once and on a partially initialized instance.
        """
        conn = getattr(self, "conn", None)
        lookup = getattr(self, "_journal_lookup", None)
        self.conn = None
        self._journal_lookup = None
        try:
            if conn:
                conn.close()
        finally:
            # Release the helper even if closing the connection raised.
            if lookup:
                lookup.close()

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        self.close()

    def get_journal_issn(self, journal_name: str) -> Optional[str]:
        """
        Get ISSN for a journal name.

        Delegates to ``JournalLookup.get_issn``.

        Args:
            journal_name: Journal name (e.g., "Nature")

        Returns:
            ISSN string or None
        """
        return self._journal_lookup.get_issn(journal_name)

    @staticmethod
    def _journal_year_filter(
        journal_identifier: str,
        year: int,
        use_issn: bool
    ) -> Tuple[str, Tuple[str, int]]:
        """Build the shared WHERE clause (and its parameters) selecting one journal-year."""
        if use_issn:
            journal_clause = "json_extract(metadata, '$.ISSN[0]') = ?"
            journal_param = journal_identifier
        else:
            # Substring match on the first container title: "Nature" also
            # matches e.g. "Nature Communications". Prefer ISSN when precision
            # matters.
            journal_clause = "json_extract(metadata, '$.container-title[0]') LIKE ?"
            journal_param = f"%{journal_identifier}%"

        where = f"""
            WHERE {journal_clause}
            AND json_extract(metadata, '$.published.date-parts[0][0]') = ?
            AND type = 'journal-article'
        """
        return where, (journal_param, year)

    def get_article_dois(
        self,
        journal_identifier: str,
        year: int,
        use_issn: bool = False,
        citable_only: bool = True
    ) -> List[str]:
        """
        Get DOIs for articles in a journal for a specific year.

        Only the ``doi`` column is fetched, not the full metadata.

        Args:
            journal_identifier: Journal name or ISSN
            year: Publication year
            use_issn: If True, match the first ISSN exactly; otherwise do a
                substring (LIKE) match on the first container title
            citable_only: If True, only return items with more than 20
                references. This is a heuristic stand-in for "citable items"
                (it drops news, editorials, letters, corrections, etc.).

        Returns:
            List of DOI strings
        """
        where, params = self._journal_year_filter(journal_identifier, year, use_issn)
        query = f"SELECT doi FROM works {where}"
        if citable_only:
            # Rows without a reference list yield NULL here and are excluded.
            query += " AND json_array_length(json_extract(metadata, '$.reference')) > 20"

        cursor = self.conn.execute(query, params)
        return [row[0] for row in cursor]

    def count_articles(
        self,
        journal_identifier: str,
        year: int,
        use_issn: bool = False
    ) -> int:
        """
        Count articles for a journal in a specific year.

        Unlike ``get_article_dois`` no citable-item filter is applied.

        Args:
            journal_identifier: Journal name or ISSN
            year: Publication year
            use_issn: If True, search by ISSN

        Returns:
            Number of articles
        """
        where, params = self._journal_year_filter(journal_identifier, year, use_issn)
        query = f"SELECT COUNT(*) as count FROM works {where}"

        result = self.conn.execute(query, params).fetchone()
        return result[0] if result else 0

    def get_citations_to_articles(
        self,
        dois: List[str],
        citation_year: int,
        method: str = "citations-table"
    ) -> int:
        """
        Count citations to a list of DOIs in a specific year.

        Args:
            dois: List of DOIs to check citations for
            citation_year: Year when citations occurred
            method: "citations-table" (fast, year-specific),
                "is-referenced-by" (fast, cumulative - ignores the year),
                "reference-graph" (slow, year-specific).
                Any other value falls back to "reference-graph".

        Returns:
            Total citation count
        """
        if method == "citations-table":
            return self._count_citations_from_table(dois, citation_year)
        if method == "is-referenced-by":
            return self._count_citations_simple(dois, citation_year)
        if method != "reference-graph":
            # Keep the historical fallback, but make a typo visible: the graph
            # scan reads every work published in citation_year.
            logger.warning(
                "Unknown citation method %r, falling back to 'reference-graph'",
                method,
            )
        return self._count_citations_from_graph(dois, citation_year)

    def _chunk_dois(self, dois: List[str]) -> List[List[str]]:
        """Split DOIs into de-duplicated batches small enough for one SQL statement."""
        # De-duplicate first: a DOI repeated across two batches would otherwise
        # be counted twice, whereas a single IN (...) list counts it once.
        unique = list(dict.fromkeys(dois))
        size = self._SQL_BATCH_SIZE
        return [unique[start:start + size] for start in range(0, len(unique), size)]

    def _count_citations_from_table(self, dois: List[str], citation_year: int) -> int:
        """
        Fast citation count using the ``citations`` table.

        Counts rows whose ``cited_doi`` is one of ``dois`` and whose
        ``citing_year`` equals ``citation_year``. DOIs are matched exactly as
        given (no case folding).
        """
        if not dois:
            return 0

        total = 0
        for batch in self._chunk_dois(dois):
            placeholders = ','.join('?' * len(batch))
            query = f"""
                SELECT COUNT(*) as total
                FROM citations
                WHERE cited_doi IN ({placeholders})
                AND citing_year = ?
            """
            result = self.conn.execute(query, [*batch, citation_year]).fetchone()
            if result and result[0]:
                total += result[0]
        return total

    def _count_citations_simple(self, dois: List[str], citation_year: int) -> int:
        """
        Sum the ``is-referenced-by-count`` metadata field of the given works.

        Note: This gives current total citations, not year-specific;
        ``citation_year`` is accepted for signature parity but not used.
        """
        if not dois:
            return 0

        total = 0
        for batch in self._chunk_dois(dois):
            placeholders = ','.join('?' * len(batch))
            query = f"""
                SELECT SUM(CAST(json_extract(metadata, '$.is-referenced-by-count') AS INTEGER)) as total
                FROM works
                WHERE doi IN ({placeholders})
            """
            result = self.conn.execute(query, batch).fetchone()
            # SUM() is NULL when no row in this batch matched.
            if result and result[0]:
                total += result[0]
        return total

    def _count_citations_from_graph(self, dois: List[str], citation_year: int) -> int:
        """
        Count citations by scanning the reference lists of citing works.

        Reads every work published in ``citation_year`` that has a reference
        list and counts references whose DOI (case-insensitively) is one of
        ``dois``. Each matching reference entry counts once.
        """
        if not dois:
            return 0

        # Create a set for fast lookup
        target_dois = {doi.lower() for doi in dois}
        citation_count = 0

        logger.info(" Querying articles with references published in %s...", citation_year)
        query = """
            SELECT metadata
            FROM works
            WHERE json_extract(metadata, '$.published.date-parts[0][0]') = ?
            AND json_extract(metadata, '$.reference') IS NOT NULL
        """
        cursor = self.conn.execute(query, (citation_year,))

        articles_checked = 0
        for row in cursor:
            articles_checked += 1
            if articles_checked % 1000 == 0:
                logger.info(
                    " Checked %s articles, found %s citations so far...",
                    articles_checked, citation_count,
                )

            # Positional access works with or without a Row factory.
            metadata = json.loads(row[0])
            references = metadata.get('reference')
            if not isinstance(references, list):
                continue

            for ref in references:
                # Reference entries may lack a DOI or carry an explicit null.
                ref_doi = ref.get('DOI') if isinstance(ref, dict) else None
                if isinstance(ref_doi, str) and ref_doi.lower() in target_dois:
                    citation_count += 1

        logger.info(" Checked %s total articles with references", articles_checked)
        return citation_count

    def calculate_impact_factor(
        self,
        journal_identifier: str,
        target_year: int,
        window_years: int = 2,
        use_issn: bool = False,
        method: str = "citations-table",
        citable_only: bool = True
    ) -> Dict:
        """
        Calculate impact factor for a journal.

        IF = citations received in ``target_year`` by the articles published in
        the ``window_years`` years preceding it, divided by the number of those
        articles.

        Args:
            journal_identifier: Journal name or ISSN
            target_year: Year for which to calculate IF
            window_years: Citation window (2 for 2-year IF, 5 for 5-year IF)
            use_issn: Use ISSN for journal identification
            method: "citations-table" (fast), "is-referenced-by", or "reference-graph"
            citable_only: If True, only count items with >20 references
                (heuristic for citable research articles). Default True.

        Returns:
            Dictionary with keys ``journal``, ``target_year``, ``window_years``,
            ``window_range``, ``articles_by_year``, ``total_articles``,
            ``total_citations``, ``impact_factor``, ``method``,
            ``citable_only`` and ``status`` ("success" or "no_articles").
            When a name is resolved to an ISSN, ``journal`` holds the ISSN.
        """
        logger.info(
            "Calculating %s-year IF for %s in %s",
            window_years, journal_identifier, target_year,
        )

        # If journal name provided, convert to ISSN for faster queries
        if not use_issn:
            logger.info("Looking up ISSN for journal: %s", journal_identifier)
            issn = self.get_journal_issn(journal_identifier)
            if issn:
                logger.info("Found ISSN: %s - using for faster queries", issn)
                journal_identifier = issn
                use_issn = True
            else:
                logger.warning(
                    "Could not find ISSN for %s, using journal name (slower)",
                    journal_identifier,
                )

        # Get articles published in the window years
        window_start = target_year - window_years
        window_end = target_year - 1

        logger.info("Fetching DOIs from %s to %s...", window_start, window_end)
        all_dois = []
        articles_by_year = {}
        item_label = 'citable items' if citable_only else 'articles'

        for year in range(window_start, window_end + 1):
            dois = self.get_article_dois(journal_identifier, year, use_issn, citable_only)
            articles_by_year[year] = len(dois)
            all_dois.extend(dois)
            logger.info(" %s: %s %s", year, len(dois), item_label)

        total_articles = len(all_dois)
        logger.info("Total articles in window: %s", total_articles)

        # Both outcomes share one shape so callers can rely on every key.
        result = {
            'journal': journal_identifier,
            'target_year': target_year,
            'window_years': window_years,
            'window_range': f"{window_start}-{window_end}",
            'articles_by_year': articles_by_year,
            'total_articles': total_articles,
            'total_citations': 0,
            'impact_factor': 0.0,
            'method': method,
            'citable_only': citable_only,
            'status': 'no_articles'
        }

        if total_articles == 0:
            logger.warning(
                "No articles found for %s in %s-%s",
                journal_identifier, window_start, window_end,
            )
            return result

        # Count citations to these articles in target_year
        logger.info(
            "Counting citations to %s articles in %s (method: %s)...",
            total_articles, target_year, method,
        )
        total_citations = self.get_citations_to_articles(
            all_dois, target_year, method
        )
        logger.info("Found %s citations", total_citations)

        # Calculate IF (total_articles is non-zero here)
        impact_factor = total_citations / total_articles
        logger.info(
            "IF = %s / %s = %.3f", total_citations, total_articles, impact_factor
        )

        result['total_citations'] = total_citations
        result['impact_factor'] = impact_factor
        result['status'] = 'success'
        return result

    def calculate_if_time_series(
        self,
        journal_identifier: str,
        start_year: int,
        end_year: int,
        window_years: int = 2,
        use_issn: bool = False,
        method: str = "is-referenced-by",
        citable_only: bool = True
    ) -> List[Dict]:
        """
        Calculate impact factor time series.

        Args:
            journal_identifier: Journal name or ISSN
            start_year: First year to calculate
            end_year: Last year to calculate (inclusive)
            window_years: Citation window
            use_issn: Use ISSN for identification
            method: Citation counting method. Note the default here is
                "is-referenced-by" (cumulative counts), which differs from the
                default of ``calculate_impact_factor``.
            citable_only: Forwarded to ``calculate_impact_factor``

        Returns:
            List of IF calculation results by year
        """
        return [
            self.calculate_impact_factor(
                journal_identifier,
                year,
                window_years=window_years,
                use_issn=use_issn,
                method=method,
                citable_only=citable_only,
            )
            for year in range(start_year, end_year + 1)
        ]

    def calculate_moving_average(
        self,
        if_time_series: List[Dict],
        window: int = 3
    ) -> List[Dict]:
        """
        Calculate moving average of impact factors.

        Each entry receives a ``moving_average`` key holding the trailing mean
        of the last ``window`` impact factors, or None for the leading entries
        where fewer than ``window`` values are available.

        Args:
            if_time_series: List of IF results from calculate_if_time_series
            window: Moving average window size (must be >= 1)

        Returns:
            The same list object, with ``moving_average`` added to each dict
            (the input dicts are modified in place).

        Raises:
            ValueError: If ``window`` is smaller than 1.
        """
        if window < 1:
            raise ValueError(f"window must be a positive integer, got {window}")

        # Extract IF values
        if_values = [r['impact_factor'] for r in if_time_series]

        # Trailing mean; positions before the first full window stay None.
        lead = window - 1
        ma_values = [
            None if idx < lead else sum(if_values[idx - lead:idx + 1]) / window
            for idx in range(len(if_values))
        ]

        # Add to results
        for result, ma_value in zip(if_time_series, ma_values):
            result['moving_average'] = ma_value

        return if_time_series
|
|
455
|
+
|
|
456
|
+
|
|
457
|
+
if __name__ == "__main__":
    # Demo run: compute the 2-year impact factor of "Nature" for 2023 using the
    # auto-detected database and print a short summary.
    logging.basicConfig(level=logging.INFO)

    rule = "=" * 60
    # (label, result key) pairs printed verbatim, in this order.
    summary_rows = (
        ("Journal", "journal"),
        ("Target Year", "target_year"),
        ("Window", "window_range"),
        ("Articles", "total_articles"),
        ("Citations", "total_citations"),
    )

    with ImpactFactorCalculator() as calc:
        result = calc.calculate_impact_factor(
            journal_identifier="Nature",
            target_year=2023,
            window_years=2,
            method="is-referenced-by",
        )

        print("\n" + rule)
        for label, key in summary_rows:
            print(f"{label}: {result[key]}")
        # The impact factor is the only field shown with fixed precision.
        print(f"Impact Factor: {result['impact_factor']:.3f}")
        print(rule)
|
|
478
|
+
|
|
479
|
+
# EOF
|