crossref-local 0.3.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,479 @@
1
+ #!/usr/bin/env python3
2
+ # -*- coding: utf-8 -*-
3
+ """
4
+ Impact Factor Calculator from CrossRef Local Database
5
+
6
+ Calculates journal impact factors by analyzing citation patterns
7
+ in the local CrossRef database.
8
+ """
9
+
10
+ import json
11
+ import sqlite3
12
+ from pathlib import Path
13
+ from typing import Dict, List, Optional, Tuple
14
+ from collections import defaultdict
15
+ import logging
16
+
17
+ from .journal_lookup import JournalLookup
18
+ from ..config import Config
19
+
20
+ logger = logging.getLogger(__name__)
21
+
22
+
23
class ImpactFactorCalculator:
    """
    Calculate journal impact factors from local CrossRef database.

    Supports:
    - 2-year and 5-year impact factors
    - Moving averages
    - Multiple calculation methods
    - Journal identification by name or ISSN

    The instance owns one SQLite connection plus a ``JournalLookup`` helper;
    use it as a context manager (or call ``close()``) to release both.
    """

    # Upper bound on DOIs bound into a single ``IN (...)`` clause. SQLite
    # rejects statements with more bound variables than its compile-time
    # limit (999 on older builds), so large DOI lists are queried in batches.
    _MAX_DOIS_PER_QUERY = 500

    # Heuristic "citable item" filter: keep works with more than 20 references.
    _CITABLE_FILTER = (
        "AND json_array_length(json_extract(metadata, '$.reference')) > 20"
    )

    def __init__(self, db_path: Optional[str] = None):
        """
        Initialize calculator with database connection.

        Args:
            db_path: Path to CrossRef SQLite database. Auto-detects if None.

        Raises:
            FileNotFoundError: If an explicit ``db_path`` does not exist.
        """
        # Define the handles first so close() is safe if setup fails part-way.
        self.conn = None
        self._journal_lookup = None

        if db_path is None:
            self.db_path = Config.get_db_path()
        else:
            self.db_path = Path(db_path)
            # NOTE(review): the existence check is applied to explicit paths
            # only; the auto-detected path is presumably validated by Config.
            if not self.db_path.exists():
                raise FileNotFoundError(f"Database not found: {db_path}")

        self._connect()
        try:
            self._journal_lookup = JournalLookup(str(self.db_path))
        except Exception:
            # Do not leak the connection opened above if the lookup fails.
            self.close()
            raise

    def _connect(self):
        """Establish database connection with name-addressable rows."""
        self.conn = sqlite3.connect(self.db_path)
        self.conn.row_factory = sqlite3.Row

    def close(self):
        """Close database connection and journal lookup; safe to call twice."""
        if self.conn:
            self.conn.close()
            self.conn = None
        if self._journal_lookup:
            self._journal_lookup.close()
            self._journal_lookup = None

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        self.close()

    def get_journal_issn(self, journal_name: str) -> Optional[str]:
        """
        Get ISSN for a journal name.

        Delegates to the ``JournalLookup`` helper created in ``__init__``.

        Args:
            journal_name: Journal name (e.g., "Nature")

        Returns:
            ISSN string or None
        """
        return self._journal_lookup.get_issn(journal_name)

    @staticmethod
    def _journal_year_clause(
        journal_identifier: str,
        year: int,
        use_issn: bool
    ) -> Tuple[str, Tuple]:
        """Build the shared WHERE clause (and its parameters) selecting
        journal articles of one journal published in one year."""
        if use_issn:
            # Exact match against the first ISSN listed in the metadata.
            journal_condition = "json_extract(metadata, '$.ISSN[0]') = ?"
            journal_param = journal_identifier
        else:
            # Substring match against the first container title.
            journal_condition = (
                "json_extract(metadata, '$.container-title[0]') LIKE ?"
            )
            journal_param = f"%{journal_identifier}%"

        where_clause = f"""
            WHERE {journal_condition}
              AND json_extract(metadata, '$.published.date-parts[0][0]') = ?
              AND type = 'journal-article'
        """
        return where_clause, (journal_param, year)

    def get_article_dois(
        self,
        journal_identifier: str,
        year: int,
        use_issn: bool = False,
        citable_only: bool = True
    ) -> List[str]:
        """
        Get DOIs for articles in a journal for a specific year.

        Only the ``doi`` column is selected, not the full metadata.

        Args:
            journal_identifier: Journal name or ISSN
            year: Publication year
            use_issn: If True, search by ISSN instead of name
            citable_only: If True, only return citable items (>20 references).
                This is a heuristic meant to exclude news, editorials,
                letters, corrections, etc.

        Returns:
            List of DOI strings
        """
        where_clause, params = self._journal_year_clause(
            journal_identifier, year, use_issn
        )
        citable_filter = self._CITABLE_FILTER if citable_only else ""
        query = f"""
            SELECT doi
            FROM works
            {where_clause}
            {citable_filter}
        """
        cursor = self.conn.execute(query, params)
        return [row[0] for row in cursor]

    def count_articles(
        self,
        journal_identifier: str,
        year: int,
        use_issn: bool = False,
        citable_only: bool = False
    ) -> int:
        """
        Count articles for a journal in a specific year.

        Args:
            journal_identifier: Journal name or ISSN
            year: Publication year
            use_issn: If True, search by ISSN
            citable_only: If True, count only items with >20 references,
                using the same filter as ``get_article_dois``. Defaults to
                False, i.e. every journal article is counted.

        Returns:
            Number of articles
        """
        where_clause, params = self._journal_year_clause(
            journal_identifier, year, use_issn
        )
        citable_filter = self._CITABLE_FILTER if citable_only else ""
        query = f"""
            SELECT COUNT(*) as count
            FROM works
            {where_clause}
            {citable_filter}
        """
        cursor = self.conn.execute(query, params)
        result = cursor.fetchone()
        return result[0] if result else 0

    def get_citations_to_articles(
        self,
        dois: List[str],
        citation_year: int,
        method: str = "citations-table"
    ) -> int:
        """
        Count citations to a list of DOIs in a specific year.

        Args:
            dois: List of DOIs to check citations for
            citation_year: Year when citations occurred
            method: "citations-table" (fast, year-specific),
                "is-referenced-by" (fast, cumulative),
                "reference-graph" (slow, accurate).
                Any other value falls back to "reference-graph" and logs
                a warning.

        Returns:
            Total citation count
        """
        if method == "citations-table":
            return self._count_citations_from_table(dois, citation_year)
        if method == "is-referenced-by":
            return self._count_citations_simple(dois, citation_year)
        if method != "reference-graph":
            # Keep the historical fallback, but make a misspelt method name
            # visible instead of silently running the slowest strategy.
            logger.warning(
                f"Unknown citation method {method!r}; falling back to 'reference-graph'"
            )
        return self._count_citations_from_graph(dois, citation_year)

    def _sum_over_doi_batches(
        self,
        query_template: str,
        dois: List[str],
        extra_params: Tuple = ()
    ) -> int:
        """Run ``query_template`` once per batch of DOIs and add up the single
        integer each run returns.

        ``query_template`` must contain a ``{placeholders}`` field inside its
        ``IN (...)`` clause. DOIs are de-duplicated first so that splitting
        into batches yields the same total as one big ``IN`` list would.
        """
        unique_dois = list(dict.fromkeys(dois))
        batch_size = self._MAX_DOIS_PER_QUERY
        total = 0

        for start in range(0, len(unique_dois), batch_size):
            batch = unique_dois[start:start + batch_size]
            placeholders = ','.join('?' * len(batch))
            cursor = self.conn.execute(
                query_template.format(placeholders=placeholders),
                [*batch, *extra_params],
            )
            row = cursor.fetchone()
            # SUM() yields NULL when nothing matched; treat that as zero.
            if row and row[0]:
                total += row[0]

        return total

    def _count_citations_from_table(self, dois: List[str], citation_year: int) -> int:
        """
        Fast citation count using the ``citations`` table.

        Counts rows whose ``cited_doi`` is one of ``dois`` and whose
        ``citing_year`` equals ``citation_year``. The lookup is intended to
        use the idx_citations_cited_new (cited_doi, citing_year) index.
        """
        if not dois:
            return 0

        query_template = """
            SELECT COUNT(*) as total
            FROM citations
            WHERE cited_doi IN ({placeholders})
              AND citing_year = ?
        """
        return self._sum_over_doi_batches(query_template, dois, (citation_year,))

    def _count_citations_simple(self, dois: List[str], citation_year: int) -> int:
        """
        Use is-referenced-by-count field (current citations only).

        Note: This gives current total citations, not year-specific;
        ``citation_year`` is accepted for interface symmetry but not used.
        For accurate year-by-year IF, use reference-graph method.
        """
        if not dois:
            return 0

        query_template = """
            SELECT SUM(CAST(json_extract(metadata, '$.is-referenced-by-count') AS INTEGER)) as total
            FROM works
            WHERE doi IN ({placeholders})
        """
        return self._sum_over_doi_batches(query_template, dois)

    def _count_citations_from_graph(self, dois: List[str], citation_year: int) -> int:
        """
        Count citations by building citation graph from reference fields.

        Scans every work published in ``citation_year`` that has a reference
        list and counts each reference whose DOI (case-insensitively) is one
        of ``dois``. This respects the citation year but reads the metadata
        of every such work.
        """
        if not dois:
            return 0

        # Lower-cased set for fast, case-insensitive membership tests.
        target_dois = {doi.lower() for doi in dois}
        citation_count = 0

        logger.info(f" Querying articles with references published in {citation_year}...")
        query = """
            SELECT metadata
            FROM works
            WHERE json_extract(metadata, '$.published.date-parts[0][0]') = ?
              AND json_extract(metadata, '$.reference') IS NOT NULL
        """
        cursor = self.conn.execute(query, (citation_year,))

        articles_checked = 0
        for row in cursor:
            articles_checked += 1
            if articles_checked % 1000 == 0:
                logger.info(f" Checked {articles_checked} articles, found {citation_count} citations so far...")

            metadata = json.loads(row['metadata'])
            references = metadata.get('reference') or []

            for ref in references:
                # Reference entries without a usable DOI (missing, null, or
                # not an object at all) cannot match and must not crash.
                if not isinstance(ref, dict):
                    continue
                ref_doi = ref.get('DOI')
                if isinstance(ref_doi, str) and ref_doi.lower() in target_dois:
                    citation_count += 1

        logger.info(f" Checked {articles_checked} total articles with references")
        return citation_count

    def calculate_impact_factor(
        self,
        journal_identifier: str,
        target_year: int,
        window_years: int = 2,
        use_issn: bool = False,
        method: str = "citations-table",
        citable_only: bool = True
    ) -> Dict:
        """
        Calculate impact factor for a journal.

        The impact factor is the number of citations made in ``target_year``
        to articles published in the ``window_years`` preceding years,
        divided by the number of those articles.

        Args:
            journal_identifier: Journal name or ISSN
            target_year: Year for which to calculate IF
            window_years: Citation window (2 for 2-year IF, 5 for 5-year IF)
            use_issn: Use ISSN for journal identification
            method: "citations-table" (fast), "is-referenced-by", or "reference-graph"
            citable_only: If True, only count citable items (research articles
                with >20 refs). Default True.

        Returns:
            Dictionary with calculation results. ``status`` is ``'success'``
            or ``'no_articles'``; in the latter case the counts and the
            impact factor are zero.
        """
        logger.info(f"Calculating {window_years}-year IF for {journal_identifier} in {target_year}")

        # If journal name provided, convert to ISSN for faster queries
        if not use_issn:
            logger.info(f"Looking up ISSN for journal: {journal_identifier}")
            issn = self.get_journal_issn(journal_identifier)
            if issn:
                logger.info(f"Found ISSN: {issn} - using for faster queries")
                journal_identifier = issn
                use_issn = True
            else:
                logger.warning(f"Could not find ISSN for {journal_identifier}, using journal name (slower)")

        # Articles published in the window years, e.g. 2021-2022 for IF 2023.
        window_start = target_year - window_years
        window_end = target_year - 1

        logger.info(f"Fetching DOIs from {window_start} to {window_end}...")
        all_dois = []
        articles_by_year = {}

        for year in range(window_start, window_end + 1):
            dois = self.get_article_dois(journal_identifier, year, use_issn, citable_only)
            articles_by_year[year] = len(dois)
            all_dois.extend(dois)
            logger.info(f" {year}: {len(dois)} {'citable items' if citable_only else 'articles'}")

        total_articles = len(all_dois)
        logger.info(f"Total articles in window: {total_articles}")

        # Fields common to both outcomes, so the two result shapes stay in sync.
        result = {
            'journal': journal_identifier,
            'target_year': target_year,
            'window_years': window_years,
            'window_range': f"{window_start}-{window_end}",
            'articles_by_year': articles_by_year,
            'total_articles': total_articles,
            'total_citations': 0,
            'impact_factor': 0.0,
            'method': method,
            'citable_only': citable_only,
        }

        if total_articles == 0:
            logger.warning(f"No articles found for {journal_identifier} in {window_start}-{window_end}")
            result['status'] = 'no_articles'
            return result

        # Count citations to these articles in target_year
        logger.info(f"Counting citations to {total_articles} articles in {target_year} (method: {method})...")
        total_citations = self.get_citations_to_articles(
            all_dois, target_year, method
        )
        logger.info(f"Found {total_citations} citations")

        # total_articles is non-zero here thanks to the early return above.
        impact_factor = total_citations / total_articles

        logger.info(f"IF = {total_citations} / {total_articles} = {impact_factor:.3f}")

        result['total_citations'] = total_citations
        result['impact_factor'] = impact_factor
        result['status'] = 'success'
        return result

    def calculate_if_time_series(
        self,
        journal_identifier: str,
        start_year: int,
        end_year: int,
        window_years: int = 2,
        use_issn: bool = False,
        method: str = "is-referenced-by",
        citable_only: bool = True
    ) -> List[Dict]:
        """
        Calculate impact factor time series.

        Args:
            journal_identifier: Journal name or ISSN
            start_year: First year to calculate
            end_year: Last year to calculate (inclusive)
            window_years: Citation window
            use_issn: Use ISSN for identification
            method: Citation counting method
            citable_only: Forwarded to ``calculate_impact_factor``

        Returns:
            List of IF calculation results by year
        """
        return [
            self.calculate_impact_factor(
                journal_identifier,
                year,
                window_years,
                use_issn,
                method,
                citable_only,
            )
            for year in range(start_year, end_year + 1)
        ]

    def calculate_moving_average(
        self,
        if_time_series: List[Dict],
        window: int = 3
    ) -> List[Dict]:
        """
        Calculate moving average of impact factors.

        Each entry of ``if_time_series`` is updated in place with a
        ``moving_average`` key: the mean of its own impact factor and the
        ``window - 1`` preceding ones, or None where fewer than ``window``
        values are available.

        Args:
            if_time_series: List of IF results from calculate_if_time_series
            window: Moving average window size (must be >= 1)

        Returns:
            The same list, with the moving_average field added

        Raises:
            ValueError: If ``window`` is smaller than 1.
        """
        if window < 1:
            raise ValueError(f"window must be a positive integer, got {window}")

        import numpy as np

        if_values = [r['impact_factor'] for r in if_time_series]

        if len(if_values) >= window:
            kernel = np.ones(window) / window
            # tolist() converts numpy scalars to plain Python floats.
            averaged = np.convolve(if_values, kernel, mode='valid').tolist()
            # Pad with None for years where MA can't be calculated
            ma_values = [None] * (window - 1) + averaged
        else:
            ma_values = [None] * len(if_values)

        for result, ma_value in zip(if_time_series, ma_values):
            result['moving_average'] = ma_value

        return if_time_series
455
+
456
+
457
if __name__ == "__main__":
    # Demo entry point: compute and print the 2-year impact factor of
    # "Nature" for 2023 using the cumulative is-referenced-by counts.
    logging.basicConfig(level=logging.INFO)

    demo_request = dict(
        journal_identifier="Nature",
        target_year=2023,
        window_years=2,
        method="is-referenced-by",
    )
    separator = "="*60

    with ImpactFactorCalculator() as calc:
        result = calc.calculate_impact_factor(**demo_request)

        # Human-readable summary framed by separator rules.
        print("\n" + separator)
        print(f"Journal: {result['journal']}")
        print(f"Target Year: {result['target_year']}")
        print(f"Window: {result['window_range']}")
        print(f"Articles: {result['total_articles']}")
        print(f"Citations: {result['total_citations']}")
        print(f"Impact Factor: {result['impact_factor']:.3f}")
        print(separator)
478
+
479
+ # EOF