crossref-local 0.3.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,128 @@
1
+ """
2
+ crossref_local - Local CrossRef database with full-text search.
3
+
4
+ A Python package for querying a local mirror of the CrossRef database
5
+ with 167M+ scholarly works, full-text search, and impact factor calculation.
6
+
7
+ Quick Start
8
+ -----------
9
+
10
+ Sync usage:
11
+ >>> from crossref_local import search, get
12
+ >>> results = search("hippocampal sharp wave ripples")
13
+ >>> work = get("10.1126/science.aax0758")
14
+
15
+ Async usage:
16
+ >>> from crossref_local import aio
17
+ >>> results = await aio.search("machine learning")
18
+ >>> counts = await aio.count_many(["CRISPR", "neural network"])
19
+
20
+ Configuration
21
+ -------------
22
+
23
+ Local mode (direct database access):
24
+ >>> from crossref_local import configure
25
+ >>> configure("/path/to/crossref.db")
26
+ Or set CROSSREF_LOCAL_DB environment variable.
27
+
28
+ Remote mode (API access via HTTP):
29
+ >>> from crossref_local import configure_remote
30
+ >>> configure_remote("http://localhost:3333")
31
+ Or set CROSSREF_LOCAL_API environment variable.
32
+
33
+ Typical setup with SSH tunnel:
34
+ $ ssh -L 3333:127.0.0.1:3333 nas # In terminal
35
+ >>> configure_remote() # Uses default localhost:3333
36
+
37
+ Public API
38
+ ----------
39
+
40
+ Functions:
41
+ search(query, limit, offset) -> SearchResult
42
+ count(query) -> int
43
+ get(doi) -> Work | None
44
+ get_many(dois) -> list[Work]
45
+ exists(doi) -> bool
46
+ configure(db_path) -> None
47
+ configure_remote(api_url) -> None
48
+ get_mode() -> str
49
+ info() -> dict
50
+
51
+ Citation functions:
52
+ get_citing(doi) -> list[str]
53
+ get_cited(doi) -> list[str]
54
+ get_citation_count(doi) -> int
55
+
56
+ Classes:
57
+ Work - Scholarly work with title, authors, DOI, etc.
58
+ SearchResult - Container for search results
59
+ CitationNetwork - Citation graph builder and visualizer
60
+
61
+ Modules:
62
+ aio - Async versions of the core query functions (search, count, get, get_many, exists, info) plus search_many/count_many
63
+ """
64
+
65
+ __version__ = "0.3.1"
66
+
67
+ # Core API (public functions)
68
+ from .api import (
69
+ search,
70
+ count,
71
+ get,
72
+ get_many,
73
+ exists,
74
+ configure,
75
+ configure_remote,
76
+ get_mode,
77
+ info,
78
+ )
79
+
80
+ # Models (public classes)
81
+ from .models import Work, SearchResult
82
+
83
+ # Async API (public module)
84
+ from . import aio
85
+
86
+ # Citation network (public functions and classes)
87
+ from .citations import get_citing, get_cited, get_citation_count, CitationNetwork
88
+
89
+
90
+ # Public API - what users should import
91
# Names re-exported as the supported public surface of the package.
__all__ = [
    # -- package version --
    "__version__",
    # -- searching and fetching works --
    "search",
    "count",
    "get",
    "get_many",
    "exists",
    # -- choosing and inspecting the backend --
    "configure",
    "configure_remote",
    "get_mode",
    "info",
    # -- model classes --
    "Work",
    "SearchResult",
    # -- async submodule --
    "aio",
    # -- citation helpers --
    "get_citing",
    "get_cited",
    "get_citation_count",
    "CitationNetwork",
]
116
+
117
+
118
+ # ============================================================================
119
+ # Advanced / Internal APIs (not in __all__, but importable if needed)
120
+ # ============================================================================
121
+ # These are exposed for advanced users but not part of the stable public API.
122
+ # Use at your own risk - they may change without notice.
123
+ #
124
+ # from crossref_local.db import Database, connection
125
+ # from crossref_local.config import Config
126
+ # from crossref_local.remote import RemoteClient
127
+ # from crossref_local.fts import search_dois
128
+ # ============================================================================
@@ -0,0 +1,6 @@
1
+ """Entry point for python -m crossref_local."""
2
+
3
+ from .cli import main
4
+
5
# Invoke the CLI only when this module is executed (python -m crossref_local),
# not when it is merely imported.
if __name__ == "__main__":
    main()
crossref_local/aio.py ADDED
@@ -0,0 +1,236 @@
1
+ """
2
+ Async API for crossref_local.
3
+
4
+ Provides async versions of the core query functions. Uses thread pool execution
5
+ with per-thread database connections for thread safety (local database only).
6
+
7
+ Usage:
8
+ from crossref_local import aio
9
+
10
+ async def main():
11
+ results = await aio.search("machine learning")
12
+ work = await aio.get("10.1038/nature12373")
13
+ n = await aio.count("CRISPR")
14
+
15
+ # Or import individual functions
16
+ from crossref_local.aio import search, get, count
17
+
18
+ # Concurrent operations
19
+ counts = await aio.count_many(["CRISPR", "machine learning"])
20
+ """
21
+
22
+ import asyncio
23
+ import threading
24
+ from typing import List, Optional
25
+
26
+ from .models import Work, SearchResult
27
+ from .config import Config
28
+ from .db import Database
29
+
30
+
31
+ # Thread-local storage for database connections
32
+ _thread_local = threading.local()
33
+
34
+
35
def _get_thread_db() -> Database:
    """Return the calling thread's Database, opening it on first use.

    Each thread keeps its own Database in ``_thread_local``. The path the
    handle was opened with is stored next to it, so a later change of the
    configured database path is picked up on the next call instead of the
    thread continuing to serve the previously opened database.

    Returns:
        Database opened on the currently configured path.
    """
    db_path = Config.get_db_path()
    cached = getattr(_thread_local, "db", None)
    if cached is not None and getattr(_thread_local, "db_path", None) == db_path:
        return cached

    if cached is not None:
        # Drop the stale handle first so a failure below cannot leave a
        # closed Database cached for this thread.
        del _thread_local.db
        # NOTE(review): assumes Database exposes close(); skipped when absent.
        close = getattr(cached, "close", None)
        if callable(close):
            close()

    _thread_local.db = Database(db_path)
    _thread_local.db_path = db_path
    return _thread_local.db
40
+
41
+
42
def _search_sync(query: str, limit: int, offset: int) -> SearchResult:
    """Run a full-text search on the calling thread's own database handle."""
    from . import fts

    thread_db = _get_thread_db()
    return fts._search_with_db(thread_db, query, limit, offset)
48
+
49
+
50
def _count_sync(query: str) -> int:
    """Count matches for ``query`` on the calling thread's own database handle."""
    from . import fts

    thread_db = _get_thread_db()
    return fts._count_with_db(thread_db, query)
55
+
56
+
57
def _get_sync(doi: str) -> Optional[Work]:
    """Look up one DOI on the calling thread's database handle.

    Returns:
        A Work built from the stored metadata, or None when the lookup
        yields nothing (or an empty value).
    """
    record = _get_thread_db().get_metadata(doi)
    return Work.from_metadata(doi, record) if record else None
64
+
65
+
66
def _get_many_sync(dois: List[str]) -> List[Work]:
    """Look up several DOIs on the calling thread's database handle.

    Returns:
        Works in input order; DOIs whose metadata lookup is empty are omitted.
    """
    thread_db = _get_thread_db()
    # Lazy pairs keep the lookup/build interleaving of a plain loop.
    lookups = ((doi, thread_db.get_metadata(doi)) for doi in dois)
    return [Work.from_metadata(doi, record) for doi, record in lookups if record]
75
+
76
+
77
def _exists_sync(doi: str) -> bool:
    """Report whether ``doi`` has a row in the ``works`` table (thread-local handle)."""
    hit = _get_thread_db().fetchone("SELECT 1 FROM works WHERE doi = ?", (doi,))
    return hit is not None
82
+
83
+
84
def _info_sync() -> dict:
    """Collect row counts from the calling thread's database handle.

    Returns:
        Dict with keys ``db_path``, ``works``, ``fts_indexed`` and
        ``citations``.
    """
    thread_db = _get_thread_db()

    def _row_count(sql: str) -> int:
        # A missing row counts as zero.
        hit = thread_db.fetchone(sql)
        return hit["count"] if hit else 0

    def _row_count_or_zero(sql: str) -> int:
        # Any error while counting is reported as 0 rather than raised.
        try:
            return _row_count(sql)
        except Exception:
            return 0

    n_works = _row_count("SELECT COUNT(*) as count FROM works")
    n_fts = _row_count_or_zero("SELECT COUNT(*) as count FROM works_fts")
    n_citations = _row_count_or_zero("SELECT COUNT(*) as count FROM citations")

    return {
        "db_path": str(Config.get_db_path()),
        "works": n_works,
        "fts_indexed": n_fts,
        "citations": n_citations,
    }
109
+
110
+
111
async def search(
    query: str,
    limit: int = 10,
    offset: int = 0,
) -> SearchResult:
    """
    Search works by full text without blocking the event loop.

    The blocking lookup is handed to ``asyncio.to_thread``.

    Args:
        query: Search query (supports FTS5 syntax)
        limit: Upper bound on the number of results returned
        offset: Number of leading results to skip (for pagination)

    Returns:
        SearchResult holding the matching works
    """
    pending = asyncio.to_thread(_search_sync, query, limit, offset)
    return await pending
128
+
129
+
130
async def count(query: str) -> int:
    """
    Count matching works in a worker thread, without fetching them.

    Args:
        query: FTS5 search query

    Returns:
        Number of works matching the query
    """
    pending = asyncio.to_thread(_count_sync, query)
    return await pending
141
+
142
+
143
async def get(doi: str) -> Optional[Work]:
    """
    Fetch a single work by DOI in a worker thread.

    Args:
        doi: Digital Object Identifier

    Returns:
        The Work, or None when the DOI is not found
    """
    pending = asyncio.to_thread(_get_sync, doi)
    return await pending
154
+
155
+
156
async def get_many(dois: List[str]) -> List[Work]:
    """
    Fetch several works by DOI in a single worker-thread call.

    Args:
        dois: List of DOIs

    Returns:
        List of Work objects; DOIs that are not found are skipped
    """
    pending = asyncio.to_thread(_get_many_sync, dois)
    return await pending
167
+
168
+
169
async def exists(doi: str) -> bool:
    """
    Check in a worker thread whether a DOI is present in the database.

    Args:
        doi: Digital Object Identifier

    Returns:
        True when the DOI exists
    """
    pending = asyncio.to_thread(_exists_sync, doi)
    return await pending
180
+
181
+
182
async def info() -> dict:
    """
    Gather database statistics in a worker thread.

    Returns:
        Dictionary with database stats
    """
    pending = asyncio.to_thread(_info_sync)
    return await pending
190
+
191
+
192
async def search_many(queries: List[str], limit: int = 10) -> List[SearchResult]:
    """
    Issue one search per query and await them all together.

    Args:
        queries: List of search queries
        limit: Maximum results per query

    Returns:
        List of SearchResult objects, in the same order as ``queries``
    """
    pending = []
    for text in queries:
        pending.append(search(text, limit=limit))
    return await asyncio.gather(*pending)
205
+
206
+
207
async def count_many(queries: List[str]) -> dict:
    """
    Count matches for multiple queries concurrently.

    Each distinct query is counted once, so repeated entries in ``queries``
    do not trigger repeated database work. The input is traversed a single
    time, so a one-shot iterable also yields a complete mapping.

    Args:
        queries: List of search queries

    Returns:
        Dict mapping query -> count, keyed in order of first appearance

    Example:
        >>> counts = await count_many(["CRISPR", "machine learning"])
        >>> print(counts)
        {'CRISPR': 45000, 'machine learning': 477922}
    """
    # dict.fromkeys drops duplicates while keeping first-seen order.
    distinct = list(dict.fromkeys(queries))
    totals = await asyncio.gather(*(count(text) for text in distinct))
    return dict(zip(distinct, totals))
225
+
226
+
227
# Coroutines exported by this module.
__all__ = [
    # single-call wrappers
    "search",
    "count",
    "get",
    "get_many",
    "exists",
    "info",
    # concurrent fan-out helpers
    "search_many",
    "count_many",
]
crossref_local/api.py ADDED
@@ -0,0 +1,221 @@
1
+ """Main API for crossref_local.
2
+
3
+ Supports two modes:
4
+ - local: Direct database access (requires database file)
5
+ - remote: HTTP API access (requires API server)
6
+
7
+ Mode is auto-detected or can be set explicitly via:
8
+ - CROSSREF_LOCAL_MODE environment variable ("local" or "remote")
9
+ - CROSSREF_LOCAL_API environment variable (API URL)
10
+ - configure() or configure_remote() functions
11
+ """
12
+
13
+ from typing import List, Optional
14
+
15
+ from .config import Config
16
+ from .db import get_db, close_db
17
+ from .models import Work, SearchResult
18
+ from . import fts
19
+
20
+
21
def _get_remote_client():
    """Build a RemoteClient for the configured API URL.

    The import happens inside the function (lazily) to avoid a circular
    dependency at module import time.
    """
    from .remote import RemoteClient

    api_url = Config.get_api_url()
    return RemoteClient(api_url)
26
+
27
+
28
def search(
    query: str,
    limit: int = 10,
    offset: int = 0,
) -> SearchResult:
    """
    Full-text search across works.

    In local mode the query is delegated to ``fts.search``. In remote mode
    it is sent through the remote client, which receives only ``query`` and
    ``limit``; a non-zero ``offset`` cannot be honoured there, so a
    ``UserWarning`` is emitted instead of silently returning the first page.

    Args:
        query: Search query (supports FTS5 syntax)
        limit: Maximum results to return
        offset: Skip first N results (for pagination; local mode only)

    Returns:
        SearchResult with matching works

    Example:
        >>> from crossref_local import search
        >>> results = search("machine learning")
        >>> print(f"Found {results.total} matches")
    """
    if Config.get_mode() == "remote":
        if offset:
            # Make the dropped pagination visible to the caller.
            import warnings

            warnings.warn(
                "crossref_local.search(): 'offset' is not forwarded in remote "
                "mode; results start from the first match.",
                UserWarning,
                stacklevel=2,
            )
        client = _get_remote_client()
        return client.search(query=query, limit=limit)
    return fts.search(query, limit, offset)
55
+
56
+
57
def count(query: str) -> int:
    """
    Count matching works without fetching results.

    In remote mode the total is read from a one-result search issued through
    the remote client; otherwise ``fts.count`` is used.

    Args:
        query: FTS5 search query

    Returns:
        Number of matching works
    """
    if Config.get_mode() != "remote":
        return fts.count(query)

    probe = _get_remote_client().search(query=query, limit=1)
    return probe.total
72
+
73
+
74
def get(doi: str) -> Optional[Work]:
    """
    Fetch a single work by its DOI.

    Args:
        doi: Digital Object Identifier

    Returns:
        Work object, or None if the DOI is not found

    Example:
        >>> from crossref_local import get
        >>> work = get("10.1038/nature12373")
        >>> print(work.title)
    """
    if Config.get_mode() == "remote":
        return _get_remote_client().get(doi)

    record = get_db().get_metadata(doi)
    return Work.from_metadata(doi, record) if record else None
97
+
98
+
99
def get_many(dois: List[str]) -> List[Work]:
    """
    Fetch several works by DOI.

    Args:
        dois: List of DOIs

    Returns:
        List of Work objects in input order (missing DOIs are skipped)
    """
    if Config.get_mode() == "remote":
        return _get_remote_client().get_many(dois)

    db = get_db()
    # Lazy pairs keep the lookup/build interleaving of a plain loop.
    lookups = ((doi, db.get_metadata(doi)) for doi in dois)
    return [Work.from_metadata(doi, record) for doi, record in lookups if record]
119
+
120
+
121
def exists(doi: str) -> bool:
    """
    Tell whether a DOI is present in the database.

    Args:
        doi: Digital Object Identifier

    Returns:
        True if the DOI exists
    """
    if Config.get_mode() == "remote":
        return _get_remote_client().exists(doi)

    hit = get_db().fetchone("SELECT 1 FROM works WHERE doi = ?", (doi,))
    return hit is not None
137
+
138
+
139
def configure(db_path: str) -> None:
    """
    Point the package at a local CrossRef SQLite database.

    Args:
        db_path: Path to CrossRef SQLite database

    Example:
        >>> from crossref_local import configure
        >>> configure("/path/to/crossref.db")
    """
    Config.set_db_path(db_path)
    # Drop the shared connection so the next access opens the new path.
    close_db()
152
+
153
+
154
def configure_remote(api_url: str = "http://localhost:3333") -> None:
    """
    Point the package at a CrossRef Local API server.

    Args:
        api_url: URL of CrossRef Local API server

    Example:
        >>> from crossref_local import configure_remote
        >>> configure_remote("http://localhost:3333")
        >>> # Or via SSH tunnel:
        >>> # ssh -L 3333:127.0.0.1:3333 nas
        >>> configure_remote()  # Uses default localhost:3333
    """
    Config.set_api_url(api_url)
169
+
170
+
171
def get_mode() -> str:
    """
    Report which backend mode is currently active.

    Returns:
        "local" or "remote"
    """
    active_mode = Config.get_mode()
    return active_mode
179
+
180
+
181
def info() -> dict:
    """
    Describe the active backend and its contents.

    Returns:
        In remote mode, the remote client's info merged over
        ``{"mode": "remote"}``. In local mode, a dict with keys ``mode``,
        ``db_path``, ``works``, ``fts_indexed`` and ``citations``.
    """
    if Config.get_mode() == "remote":
        remote_stats = _get_remote_client().info()
        return {"mode": "remote", **remote_stats}

    db = get_db()

    def _row_count(sql: str) -> int:
        # A missing row counts as zero.
        hit = db.fetchone(sql)
        return hit["count"] if hit else 0

    def _row_count_or_zero(sql: str) -> int:
        # Any error while counting is reported as 0 rather than raised.
        try:
            return _row_count(sql)
        except Exception:
            return 0

    n_works = _row_count("SELECT COUNT(*) as count FROM works")
    n_fts = _row_count_or_zero("SELECT COUNT(*) as count FROM works_fts")
    n_citations = _row_count_or_zero("SELECT COUNT(*) as count FROM citations")

    return {
        "mode": "local",
        "db_path": str(Config.get_db_path()),
        "works": n_works,
        "fts_indexed": n_fts,
        "citations": n_citations,
    }