crossref-local 0.3.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,274 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ Journal lookup module for fast name-to-ISSN resolution.
4
+
5
+ Uses OpenAlex journals table (222k journals with IF proxy) for fast lookups.
6
+ Falls back to direct database query if table doesn't exist.
7
+ """
8
+
9
+ import json
10
+ import sqlite3
11
+ from typing import Dict, List, Optional
12
+ import logging
13
+
14
+ logger = logging.getLogger(__name__)
15
+
16
+
17
class JournalLookup:
    """
    Fast journal name to ISSN lookup.

    Uses the journals_openalex table (222k journals with an Impact Factor
    proxy) for O(1) lookups. Falls back to a slow scan of the works table's
    JSON metadata if the OpenAlex table doesn't exist.

    Supports use as a context manager so the connection is always closed.
    """

    def __init__(self, db_path: str):
        """
        Initialize journal lookup.

        Args:
            db_path: Path to CrossRef SQLite database
        """
        self.db_path = db_path
        self.conn = sqlite3.connect(db_path)
        self.conn.row_factory = sqlite3.Row  # allow column access by name
        self._log = logging.getLogger(__name__)
        self._openalex_exists = self._table_exists("journals_openalex")
        self._issn_lookup_exists = self._table_exists("issn_lookup")

        if self._openalex_exists:
            self._log.info("Using journals_openalex table for fast lookups")
        else:
            self._log.warning(
                "journals_openalex table not found. "
                "Run download_openalex_journals.py for fast lookups. "
                "Falling back to slow query."
            )

    def _table_exists(self, table_name: str) -> bool:
        """Return True if a table with the given name exists in the database."""
        cursor = self.conn.execute(
            "SELECT name FROM sqlite_master WHERE type='table' AND name=?",
            (table_name,),
        )
        return cursor.fetchone() is not None

    def _check_openalex_table(self) -> bool:
        """Check if OpenAlex journals table exists."""
        return self._table_exists("journals_openalex")

    def _check_issn_lookup_table(self) -> bool:
        """Check if ISSN lookup table exists."""
        return self._table_exists("issn_lookup")

    def get_issn(self, journal_name: str, strict: bool = True) -> Optional[str]:
        """
        Get ISSN for a journal name.

        Args:
            journal_name: Journal name (case-insensitive)
            strict: If True, only exact matches. If False, allow partial matches.

        Returns:
            ISSN string or None if not found
        """
        if self._openalex_exists:
            return self._get_issn_openalex(journal_name, strict)
        return self._get_issn_slow(journal_name, strict)

    def _get_issn_openalex(self, journal_name: str, strict: bool = True) -> Optional[str]:
        """Fast lookup using the OpenAlex journals table."""
        # Try exact (case-insensitive) match first.
        cursor = self.conn.execute("""
            SELECT issn_l FROM journals_openalex
            WHERE name_lower = ?
            LIMIT 1
        """, (journal_name.lower(),))

        result = cursor.fetchone()
        if result and result[0]:
            return result[0]

        # In strict mode, never fall through to a fuzzy match.
        if strict:
            self._log.debug(f"Strict mode: no exact match for '{journal_name}'")
            return None

        # Partial match: prefer the journal with the most works so an
        # obscure journal with a similar name is less likely to win.
        self._log.warning(f"Using partial match for '{journal_name}' - results may be inaccurate")
        cursor = self.conn.execute("""
            SELECT issn_l, name FROM journals_openalex
            WHERE name_lower LIKE ?
            ORDER BY works_count DESC
            LIMIT 1
        """, (f"%{journal_name.lower()}%",))

        result = cursor.fetchone()
        if result and result[0]:
            self._log.warning(f" Matched to: '{result[1]}'")
            return result[0]
        return None

    def _get_issn_slow(self, journal_name: str, strict: bool = True) -> Optional[str]:
        """Slow fallback: scan the works table's JSON metadata."""
        if strict:
            # Exact container-title match.
            cursor = self.conn.execute("""
                SELECT DISTINCT json_extract(metadata, '$.ISSN[0]') as issn
                FROM works
                WHERE json_extract(metadata, '$.container-title[0]') = ?
                AND json_extract(metadata, '$.ISSN[0]') IS NOT NULL
                LIMIT 1
            """, (journal_name,))
        else:
            # Partial container-title match.
            cursor = self.conn.execute("""
                SELECT DISTINCT json_extract(metadata, '$.ISSN[0]') as issn
                FROM works
                WHERE json_extract(metadata, '$.container-title[0]') LIKE ?
                AND json_extract(metadata, '$.ISSN[0]') IS NOT NULL
                LIMIT 1
            """, (f"%{journal_name}%",))

        result = cursor.fetchone()
        return result[0] if result else None

    def search(self, query: str, limit: int = 10) -> List[Dict]:
        """
        Search for journals by name.

        Args:
            query: Search query (partial name match)
            limit: Maximum results to return

        Returns:
            List of journal info dictionaries with IF proxy. Empty list if
            the journals_openalex table is not available.
        """
        if not self._openalex_exists:
            return []

        cursor = self.conn.execute("""
            SELECT name, issn_l, publisher, works_count,
                   two_year_mean_citedness, h_index
            FROM journals_openalex
            WHERE name_lower LIKE ?
            ORDER BY works_count DESC
            LIMIT ?
        """, (f"%{query.lower()}%", limit))

        return [
            {
                "name": row["name"],
                "issn": row["issn_l"],
                "publisher": row["publisher"],
                "works_count": row["works_count"],
                "if_proxy": row["two_year_mean_citedness"],
                "h_index": row["h_index"]
            }
            for row in cursor.fetchall()
        ]

    def get_info(self, issn: str) -> Optional[Dict]:
        """
        Get journal info by ISSN.

        Args:
            issn: Journal ISSN

        Returns:
            Journal info dictionary with IF proxy, or None if not found or
            the journals_openalex table is not available.
        """
        if not self._openalex_exists:
            return None

        # Try direct ISSN-L match first.
        cursor = self.conn.execute("""
            SELECT name, issn_l, issns, publisher, works_count,
                   two_year_mean_citedness, h_index, is_oa
            FROM journals_openalex
            WHERE issn_l = ?
            LIMIT 1
        """, (issn,))

        row = cursor.fetchone()

        # If not found, resolve alternate ISSNs through the issn_lookup table.
        if not row and self._issn_lookup_exists:
            cursor = self.conn.execute("""
                SELECT jo.name, jo.issn_l, jo.issns, jo.publisher, jo.works_count,
                       jo.two_year_mean_citedness, jo.h_index, jo.is_oa
                FROM issn_lookup il
                JOIN journals_openalex jo ON il.journal_id = jo.id
                WHERE il.issn = ?
                LIMIT 1
            """, (issn,))
            row = cursor.fetchone()

        if row:
            issns = []
            if row["issns"]:
                try:
                    issns = json.loads(row["issns"])
                except (ValueError, TypeError):
                    # Malformed JSON in the issns column; keep the empty list.
                    pass
            return {
                "name": row["name"],
                "issn": row["issn_l"],
                "issns": issns,
                "publisher": row["publisher"],
                "works_count": row["works_count"],
                "if_proxy": row["two_year_mean_citedness"],
                "h_index": row["h_index"],
                "is_oa": row["is_oa"]
            }
        return None

    def get_if_proxy(self, journal_name: str, strict: bool = True) -> Optional[float]:
        """
        Get OpenAlex Impact Factor proxy for a journal.

        Args:
            journal_name: Journal name
            strict: If True, only exact matches

        Returns:
            2-year mean citedness (IF proxy) or None
        """
        if not self._openalex_exists:
            return None

        # Try exact match.
        cursor = self.conn.execute("""
            SELECT two_year_mean_citedness FROM journals_openalex
            WHERE name_lower = ?
            LIMIT 1
        """, (journal_name.lower(),))

        result = cursor.fetchone()
        # "is not None" so a legitimate IF proxy of 0.0 is still returned.
        if result and result[0] is not None:
            return result[0]

        if strict:
            return None

        # Partial match (only if not strict), biggest journal first.
        cursor = self.conn.execute("""
            SELECT two_year_mean_citedness FROM journals_openalex
            WHERE name_lower LIKE ?
            ORDER BY works_count DESC
            LIMIT 1
        """, (f"%{journal_name.lower()}%",))

        result = cursor.fetchone()
        return result[0] if result and result[0] is not None else None

    def close(self):
        """Close database connection."""
        if self.conn:
            self.conn.close()

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        self.close()
@@ -0,0 +1,202 @@
1
+ """MCP server for CrossRef Local - Claude integration.
2
+
3
+ This server exposes crossref-local functionality as MCP tools,
4
+ enabling Claude Desktop and other MCP clients to search academic papers.
5
+
6
+ Usage:
7
+ crossref-local serve # stdio (Claude Desktop)
8
+ crossref-local serve -t http --port 8082 # HTTP transport
9
+ crossref-local-mcp # Direct entry point
10
+ """
11
+
12
+ import json
13
+ from typing import Optional
14
+
15
+ from fastmcp import FastMCP
16
+
17
+ from . import search, get, count, info, __version__
18
+ from .impact_factor import ImpactFactorCalculator
19
+
20
# Initialize the MCP server; the @mcp.tool() functions below register
# themselves on this instance, and run_server() starts it.
mcp = FastMCP(
    name="crossref-local",
    instructions="Local CrossRef database with 167M+ works and full-text search. "
    "Use search_works to find papers, get_work for DOI lookup, count_works for counts, "
    "database_info for stats, and calculate_impact_factor for journal metrics.",
)
27
+
28
+
29
@mcp.tool()
def search_works(
    query: str,
    limit: int = 10,
    offset: int = 0,
    with_abstracts: bool = False,
) -> str:
    """Search for academic works by title, abstract, or authors.

    Backed by an FTS5 full-text index over 167M+ papers, so FTS5 query
    syntax is supported: AND, OR, NOT, "exact phrases".

    Args:
        query: Search query (e.g., "machine learning", "CRISPR", "neural network AND hippocampus")
        limit: Maximum number of results to return (default: 10, capped at 100)
        offset: Skip first N results for pagination (default: 0)
        with_abstracts: Include abstracts in results (default: False)

    Returns:
        JSON string with the total match count and the matching works.

    Examples:
        search_works("machine learning")
        search_works("CRISPR", limit=20)
        search_works("neural network AND memory", with_abstracts=True)
    """
    # Cap the page size so a client cannot request an unbounded result set.
    hits = search(query, limit=min(limit, 100), offset=offset)

    serialized = []
    for hit in hits.works:
        entry = {
            "doi": hit.doi,
            "title": hit.title,
            "authors": hit.authors,
            "year": hit.year,
            "journal": hit.journal,
        }
        # Abstracts are opt-in to keep the default payload small.
        if with_abstracts and hit.abstract:
            entry["abstract"] = hit.abstract
        serialized.append(entry)

    payload = {
        "query": hits.query,
        "total": hits.total,
        "returned": len(serialized),
        "elapsed_ms": round(hits.elapsed_ms, 2),
        "works": serialized,
    }
    return json.dumps(payload, indent=2)
80
+
81
+
82
@mcp.tool()
def get_work(doi: str, as_citation: bool = False) -> str:
    """Look up a single work by its DOI.

    Args:
        doi: Digital Object Identifier (e.g., "10.1038/nature12373")
        as_citation: Return a formatted citation instead of full metadata

    Returns:
        JSON string with work metadata, a formatted citation string, or a
        JSON error object when the DOI is unknown.

    Examples:
        get_work("10.1038/nature12373")
        get_work("10.1126/science.aax0758", as_citation=True)
    """
    record = get(doi)
    if record is None:
        return json.dumps({"error": f"DOI not found: {doi}"})
    if as_citation:
        return record.citation()
    return json.dumps(record.to_dict(), indent=2)
106
+
107
+
108
@mcp.tool()
def count_works(query: str) -> str:
    """Count matching works without fetching results.

    Cheaper than search_works when only the number of matches is needed.

    Args:
        query: FTS5 search query

    Returns:
        JSON string with the query and its match count.

    Examples:
        count_works("CRISPR")
        count_works("machine learning AND deep")
    """
    matches = count(query)
    return json.dumps({"query": query, "count": matches})
126
+
127
+
128
@mcp.tool()
def database_info() -> str:
    """Report database statistics and status.

    Returns:
        JSON string with database path, work count, FTS index count, and citation count.
    """
    return json.dumps(info(), indent=2)
137
+
138
+
139
@mcp.tool()
def calculate_impact_factor(
    journal: str,
    year: int = 2023,
    window: int = 2,
) -> str:
    """Calculate an impact factor for a journal.

    Impact factor = citations received in the target year divided by the
    articles published in the preceding window years.

    Args:
        journal: Journal name or ISSN (e.g., "Nature", "Science", "0028-0836")
        year: Target year for the citation count (default: 2023)
        window: Article window size in years (default: 2 for the standard IF)

    Returns:
        JSON string with journal name, article count, citation count, and
        impact factor — or a JSON error object on failure.

    Examples:
        calculate_impact_factor("Nature")
        calculate_impact_factor("Science", year=2022)
        calculate_impact_factor("Cell", window=5)  # 5-year impact factor
    """
    try:
        with ImpactFactorCalculator() as calculator:
            metrics = calculator.calculate_impact_factor(
                journal_identifier=journal,
                target_year=year,
                window_years=window,
            )
            return json.dumps(metrics, indent=2)
    except Exception as exc:  # tool boundary: surface any failure to the MCP client
        return json.dumps({"error": str(exc)})
172
+
173
+
174
def run_server(
    transport: str = "stdio",
    host: str = "localhost",
    port: int = 8082,
) -> None:
    """Run the MCP server.

    Args:
        transport: Transport protocol ("stdio", "sse", or "http")
        host: Host for HTTP/SSE transport (ignored for stdio)
        port: Port for HTTP/SSE transport (ignored for stdio)

    Raises:
        ValueError: If transport is not one of the supported values.
    """
    if transport == "stdio":
        # stdio takes no host/port — used by Claude Desktop.
        mcp.run(transport="stdio")
        return
    # Map the public transport names onto fastmcp's identifiers.
    fastmcp_transport = {"sse": "sse", "http": "streamable-http"}.get(transport)
    if fastmcp_transport is None:
        raise ValueError(f"Unknown transport: {transport}")
    mcp.run(transport=fastmcp_transport, host=host, port=port)
194
+
195
+
196
def main():
    """Entry point for the crossref-local-mcp console script (stdio transport)."""
    run_server(transport="stdio")


if __name__ == "__main__":
    main()
@@ -0,0 +1,186 @@
1
+ """Data models for crossref_local."""
2
+
3
+ from dataclasses import dataclass, field
4
+ from typing import List, Optional
5
+ import json
6
+
7
+
8
@dataclass
class Work:
    """
    Represents a scholarly work from CrossRef.

    Attributes:
        doi: Digital Object Identifier
        title: Work title
        authors: List of author names
        year: Publication year
        journal: Journal/container title
        issn: Journal ISSN
        volume: Volume number
        issue: Issue number
        page: Page range
        publisher: Publisher name
        type: Work type (journal-article, book-chapter, etc.)
        abstract: Abstract text (if available)
        url: Resource URL
        citation_count: Number of citations (if available)
        references: List of reference DOIs
    """

    doi: str
    title: Optional[str] = None
    authors: List[str] = field(default_factory=list)
    year: Optional[int] = None
    journal: Optional[str] = None
    issn: Optional[str] = None
    volume: Optional[str] = None
    issue: Optional[str] = None
    page: Optional[str] = None
    publisher: Optional[str] = None
    type: Optional[str] = None
    abstract: Optional[str] = None
    url: Optional[str] = None
    citation_count: Optional[int] = None
    references: List[str] = field(default_factory=list)

    @classmethod
    def from_metadata(cls, doi: str, metadata: dict) -> "Work":
        """
        Create Work from CrossRef metadata JSON.

        Args:
            doi: DOI string
            metadata: CrossRef metadata dictionary

        Returns:
            Work instance
        """
        # Extract authors: prefer "given family", fall back to family only,
        # then to the literal "name" field (used for consortia/organizations).
        authors = []
        for author in metadata.get("author", []):
            given = author.get("given", "")
            family = author.get("family", "")
            if given and family:
                authors.append(f"{given} {family}")
            elif family:
                authors.append(family)
            elif author.get("name"):
                authors.append(author["name"])

        # Extract year. CrossRef records may carry the date under "published",
        # "published-print", or "published-online"; try them in that order.
        year = None
        published = (
            metadata.get("published")
            or metadata.get("published-print")
            or metadata.get("published-online")
            or {}
        )
        date_parts = published.get("date-parts", [[]])
        if date_parts and date_parts[0]:
            year = date_parts[0][0]

        # Extract referenced DOIs (entries without a DOI are skipped).
        references = [
            ref["DOI"] for ref in metadata.get("reference", []) if ref.get("DOI")
        ]

        # Container title (journal name) and ISSN are lists; take the first.
        container_titles = metadata.get("container-title", [])
        journal = container_titles[0] if container_titles else None

        issns = metadata.get("ISSN", [])
        issn = issns[0] if issns else None

        titles = metadata.get("title") or []
        title = titles[0] if titles else None

        return cls(
            doi=doi,
            title=title,
            authors=authors,
            year=year,
            journal=journal,
            issn=issn,
            volume=metadata.get("volume"),
            issue=metadata.get("issue"),
            page=metadata.get("page"),
            publisher=metadata.get("publisher"),
            type=metadata.get("type"),
            abstract=metadata.get("abstract"),
            url=metadata.get("URL"),
            citation_count=metadata.get("is-referenced-by-count"),
            references=references,
        )

    def to_dict(self) -> dict:
        """Convert to dictionary."""
        return {
            "doi": self.doi,
            "title": self.title,
            "authors": self.authors,
            "year": self.year,
            "journal": self.journal,
            "issn": self.issn,
            "volume": self.volume,
            "issue": self.issue,
            "page": self.page,
            "publisher": self.publisher,
            "type": self.type,
            "abstract": self.abstract,
            "url": self.url,
            "citation_count": self.citation_count,
            "references": self.references,
        }

    def citation(self, style: str = "apa") -> str:
        """
        Format as citation string.

        Args:
            style: Citation style (currently only "apa" supported)

        Returns:
            Formatted citation string
        """
        authors_str = ", ".join(self.authors[:3])
        if len(self.authors) > 3:
            authors_str += " et al."

        year_str = f"({self.year})" if self.year else "(n.d.)"
        title_str = self.title or "Untitled"
        journal_str = f"*{self.journal}*" if self.journal else ""

        parts = [authors_str, year_str, title_str]
        if journal_str:
            parts.append(journal_str)
        if self.volume:
            # Standard "volume(issue)" form when both are present.
            volume_str = f"{self.volume}"
            if self.issue:
                volume_str += f"({self.issue})"
            parts.append(volume_str)
        elif self.issue:
            # No volume: emit the issue on its own instead of gluing it
            # onto the previous part (journal or title).
            parts.append(f"({self.issue})")
        if self.page:
            parts.append(self.page)
        parts.append(f"https://doi.org/{self.doi}")

        # filter(None, ...) drops empty parts (e.g. no authors).
        return ". ".join(filter(None, parts))
160
+
161
+
162
@dataclass
class SearchResult:
    """
    Container for search results with metadata.

    Behaves like a read-only sequence of Work objects: supports len(),
    iteration, and indexing (including slices).

    Attributes:
        works: List of Work objects
        total: Total number of matches
        query: Original search query
        elapsed_ms: Search time in milliseconds
    """

    works: List[Work]
    total: int
    query: str
    elapsed_ms: float

    def __len__(self) -> int:
        # Number of works returned in this page, not the total match count.
        return len(self.works)

    def __iter__(self):
        # Iterate directly over the returned Work objects.
        return iter(self.works)

    def __getitem__(self, idx):
        # Delegate indexing (and slicing) to the underlying list.
        return self.works[idx]