openalex-local 0.1.0__tar.gz → 0.3.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: openalex-local
3
- Version: 0.1.0
3
+ Version: 0.3.0
4
4
  Summary: Local OpenAlex database with 284M+ works, abstracts, and semantic search
5
5
  Author-email: Yusuke Watanabe <ywatanabe@alumni.u-tokyo.ac.jp>
6
6
  License: AGPL-3.0
File without changes
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "openalex-local"
7
- version = "0.1.0"
7
+ version = "0.3.0"
8
8
  description = "Local OpenAlex database with 284M+ works, abstracts, and semantic search"
9
9
  readme = "README.md"
10
10
  license = {text = "AGPL-3.0"}
@@ -0,0 +1,44 @@
1
+ """
2
+ OpenAlex Local - Local OpenAlex database with 284M+ works and semantic search.
3
+
4
+ Example:
5
+ >>> from openalex_local import search, get
6
+ >>> results = search("machine learning neural networks")
7
+ >>> work = get("W2741809807") # OpenAlex ID
8
+ >>> work = get("10.1038/nature12373") # or DOI
9
+ """
10
+
11
+ __version__ = "0.3.0"
12
+
13
+ from .api import (
14
+ Config,
15
+ SearchResult,
16
+ Work,
17
+ configure,
18
+ configure_http,
19
+ count,
20
+ exists,
21
+ get,
22
+ get_many,
23
+ get_mode,
24
+ info,
25
+ search,
26
+ )
27
+
28
+ __all__ = [
29
+ # Core functions
30
+ "search",
31
+ "count",
32
+ "get",
33
+ "get_many",
34
+ "exists",
35
+ "info",
36
+ # Configuration
37
+ "configure",
38
+ "configure_http",
39
+ "get_mode",
40
+ # Classes
41
+ "Work",
42
+ "SearchResult",
43
+ "Config",
44
+ ]
@@ -0,0 +1,6 @@
1
+ """Allow running as python -m openalex_local."""
2
+
3
+ from .cli import main
4
+
5
+ if __name__ == "__main__":
6
+ main()
@@ -0,0 +1,258 @@
1
+ """Main API for openalex_local.
2
+
3
+ Supports two modes:
4
+ - db: Direct database access (requires database file)
5
+ - http: HTTP API access (requires API server)
6
+
7
+ Mode is auto-detected or can be set explicitly via:
8
+ - OPENALEX_LOCAL_MODE environment variable ("db" or "http")
9
+ - OPENALEX_LOCAL_API_URL environment variable (API URL)
10
+ - configure() or configure_http() functions
11
+ """
12
+
13
+ from typing import List, Optional
14
+
15
+ from . import fts
16
+ from .config import Config
17
+ from .db import close_db, get_db
18
+ from .models import SearchResult, Work
19
+
20
+ __all__ = [
21
+ "search",
22
+ "count",
23
+ "get",
24
+ "get_many",
25
+ "exists",
26
+ "configure",
27
+ "configure_http",
28
+ "get_mode",
29
+ "info",
30
+ "Work",
31
+ "SearchResult",
32
+ "Config",
33
+ ]
34
+
35
+
36
def _get_http_client():
    """Build a client for the HTTP API at the configured URL.

    The ``remote`` module is imported lazily, inside the function, to avoid
    a circular dependency at import time.

    Returns:
        A ``RemoteClient`` constructed with ``Config.get_api_url()``.

    Raises:
        NotImplementedError: If the ``remote`` module cannot be imported.
    """
    try:
        from .remote import RemoteClient
    except ImportError as err:
        # Chain the cause so the underlying import problem stays visible.
        raise NotImplementedError(
            "HTTP mode not yet implemented. Use database mode by setting "
            "OPENALEX_LOCAL_DB environment variable."
        ) from err

    # Constructed outside the ``try`` so that an ImportError raised while
    # building the client is not misreported as "not yet implemented".
    return RemoteClient(Config.get_api_url())
47
+
48
+
49
def search(
    query: str,
    limit: int = 20,
    offset: int = 0,
) -> SearchResult:
    """
    Run a full-text search, dispatching on the configured mode.

    In "http" mode the query is forwarded to the HTTP client; otherwise it
    is handed to the local FTS module.

    Args:
        query: Search query (supports FTS5 syntax)
        limit: Maximum results to return
        offset: Skip first N results (for pagination)

    Returns:
        SearchResult with matching works

    Example:
        >>> from openalex_local import search
        >>> results = search("machine learning")
        >>> print(f"Found {results.total} matches")
    """
    use_http = Config.get_mode() == "http"
    if not use_http:
        return fts.search(query, limit, offset)

    return _get_http_client().search(query=query, limit=limit, offset=offset)
76
+
77
+
78
def count(query: str) -> int:
    """
    Return the number of works matching a query.

    Args:
        query: FTS5 search query

    Returns:
        Number of matching works
    """
    if Config.get_mode() != "http":
        return fts.count(query)

    # The HTTP path issues a one-result search and reads only its total.
    remote_result = _get_http_client().search(query=query, limit=1)
    return remote_result.total
93
+
94
+
95
def get(id_or_doi: str) -> Optional[Work]:
    """
    Look up a single work by OpenAlex ID or DOI.

    In DB mode, identifiers starting with "W"/"w" are first tried as an
    upper-cased OpenAlex ID; if that finds nothing (or the identifier does
    not start with "W"), the value is tried as a DOI.

    Args:
        id_or_doi: OpenAlex ID (e.g., W2741809807) or DOI

    Returns:
        Work object or None if not found

    Example:
        >>> from openalex_local import get
        >>> work = get("W2741809807")
        >>> work = get("10.1038/nature12373")
        >>> print(work.title)
    """
    if Config.get_mode() == "http":
        return _get_http_client().get(id_or_doi)

    database = get_db()

    record = None
    if id_or_doi.startswith(("W", "w")):
        record = database.get_work(id_or_doi.upper())

    # Fall back to a DOI lookup when the ID lookup was skipped or empty.
    if not record:
        record = database.get_work_by_doi(id_or_doi)

    return Work.from_db_row(record) if record else None
129
+
130
+
131
def get_many(ids: List[str]) -> List[Work]:
    """
    Fetch several works by OpenAlex ID or DOI.

    Args:
        ids: List of OpenAlex IDs or DOIs

    Returns:
        List of Work objects, in input order (missing IDs are skipped)
    """
    if Config.get_mode() == "http":
        return _get_http_client().get_many(ids)

    # One get() call per identifier; falsy results are dropped.
    looked_up = (get(identifier) for identifier in ids)
    return [work for work in looked_up if work]
151
+
152
+
153
def exists(id_or_doi: str) -> bool:
    """
    Report whether a work is present.

    In DB mode, the lookup order matches get(): OpenAlex ID first (for
    identifiers starting with "W"/"w"), then DOI.

    Args:
        id_or_doi: OpenAlex ID or DOI

    Returns:
        True if work exists
    """
    if Config.get_mode() == "http":
        return _get_http_client().exists(id_or_doi)

    database = get_db()

    if id_or_doi.startswith(("W", "w")):
        by_id = database.fetchone(
            "SELECT 1 FROM works WHERE openalex_id = ?", (id_or_doi.upper(),)
        )
        if by_id:
            return True

    by_doi = database.fetchone("SELECT 1 FROM works WHERE doi = ?", (id_or_doi,))
    return by_doi is not None
180
+
181
+
182
def configure(db_path: str) -> None:
    """
    Point the library at a local database file.

    Args:
        db_path: Path to OpenAlex SQLite database

    Example:
        >>> from openalex_local import configure
        >>> configure("/path/to/openalex.db")
    """
    # Record the new path first, then drop the shared connection.
    Config.set_db_path(path=db_path)
    close_db()
195
+
196
+
197
def configure_http(api_url: str = "http://localhost:31292") -> None:
    """
    Point the library at an HTTP API server.

    Args:
        api_url: URL of OpenAlex Local API server

    Example:
        >>> from openalex_local import configure_http
        >>> configure_http("http://localhost:31292")
    """
    Config.set_api_url(url=api_url)
209
+
210
+
211
def get_mode() -> str:
    """
    Report the active access mode, as resolved by Config.

    Returns:
        "db" or "http"
    """
    active_mode = Config.get_mode()
    return active_mode
219
+
220
+
221
def info() -> dict:
    """
    Get database/API information.

    In HTTP mode the client's info() dictionary is merged over
    ``{"mode": "http", "status": "ok"}``. In DB mode the row counts of the
    ``works`` and ``works_fts`` tables are reported.

    Returns:
        Dictionary with database stats and mode info

    Raises:
        FileNotFoundError: If DB mode is active and no database can be found
    """
    mode = Config.get_mode()

    if mode == "http":
        http_info = _get_http_client().info()
        return {"mode": "http", "status": "ok", **http_info}

    # Function-scope import: this module has no top-level sqlite3 import.
    import sqlite3

    # DB mode - will raise FileNotFoundError if no database
    db = get_db()

    row = db.fetchone("SELECT COUNT(*) as count FROM works")
    work_count = row["count"] if row else 0

    # The FTS count is best-effort. Only database errors (for example a
    # missing works_fts table) are treated as "nothing indexed"; anything
    # else is a programming error and is allowed to surface.
    try:
        fts_row = db.fetchone("SELECT COUNT(*) as count FROM works_fts")
    except sqlite3.Error:
        fts_count = 0
    else:
        fts_count = fts_row["count"] if fts_row else 0

    return {
        "status": "ok",
        "mode": "db",
        "db_path": str(Config.get_db_path()),
        "work_count": work_count,
        "fts_indexed": fts_count,
    }
@@ -0,0 +1,209 @@
1
+ #!/usr/bin/env python3
2
+ """CLI for openalex_local."""
3
+
4
+ import json
5
+ import sys
6
+
7
+ import click
8
+
9
+ from . import __version__
10
+
11
+
12
def _print_recursive_help(ctx, param, value):
    """Callback for --help-recursive: print help for every command, then exit."""
    if ctx.resilient_parsing or not value:
        return

    def _walk(command, title: str, parent_ctx):
        """Print help for one command, then descend into its subcommands."""
        click.secho(f"\n━━━ {title} ━━━", fg="cyan", bold=True)
        # The last word of the title is the command's own name.
        own_name = title.split()[-1]
        child_ctx = click.Context(command, info_name=own_name, parent=parent_ctx)
        click.echo(command.get_help(child_ctx))

        if not isinstance(command, click.Group):
            return
        for child_name in sorted(command.commands):
            _walk(command.commands[child_name], f"{title} {child_name}", child_ctx)

    # Top-level help first.
    click.secho("━━━ openalex-local ━━━", fg="cyan", bold=True)
    click.echo(ctx.get_help())

    # Then every registered subcommand, in name order.
    for top_name in sorted(cli.commands):
        _walk(cli.commands[top_name], f"openalex-local {top_name}", ctx)

    ctx.exit(0)
36
+
37
+
38
@click.group(context_settings={"help_option_names": ["-h", "--help"]})
@click.version_option(__version__, "--version")
@click.option("--http", is_flag=True, help="Use HTTP API instead of direct database")
@click.option("--api-url", help="API URL for http mode (default: auto-detect)")
@click.option(
    "--help-recursive",
    is_flag=True,
    is_eager=True,
    expose_value=False,
    callback=_print_recursive_help,
    help="Show help for all commands recursively.",
)
@click.pass_context
def cli(ctx, http, api_url):
    """
    Local OpenAlex database with 284M+ works and full-text search.

    \b
    Supports both direct database access (db mode) and HTTP API (http mode).

    \b
    DB mode (default if database found):
    openalex-local search "machine learning"

    \b
    HTTP mode (connect to API server):
    openalex-local --http search "machine learning"
    """
    ctx.ensure_object(dict)

    if not (http or api_url):
        return

    from . import configure_http
    from .config import Config

    # With --http but no --api-url, resolve the URL through Config so the
    # API-URL environment variables are honoured before falling back to the
    # built-in localhost default (previously the default was forced).
    configure_http(api_url or Config.get_api_url())
73
+
74
@cli.command("search")
@click.argument("query")
@click.option("-n", "--number", default=10, help="Number of results")
@click.option("-o", "--offset", default=0, help="Skip first N results")
@click.option("-a", "--abstracts", is_flag=True, help="Show abstracts")
@click.option("-A", "--authors", is_flag=True, help="Show authors")
@click.option("--concepts", is_flag=True, help="Show concepts/topics")
@click.option("--json", "as_json", is_flag=True, help="Output as JSON")
def search_cmd(query, number, offset, abstracts, authors, concepts, as_json):
    """Search for works by title, abstract, or authors."""
    from . import search

    try:
        results = search(query, limit=number, offset=offset)
    except FileNotFoundError as e:
        # No database available: report on stderr and exit non-zero.
        click.secho(f"Error: {e}", fg="red", err=True)
        sys.exit(1)

    if as_json:
        output = {
            "query": query,
            "total": results.total,
            "elapsed_ms": results.elapsed_ms,
            "works": [w.to_dict() for w in results.works],
        }
        click.echo(json.dumps(output, indent=2))
        return

    click.secho(
        f"Found {results.total:,} matches in {results.elapsed_ms:.1f}ms\n",
        fg="green",
    )

    for i, work in enumerate(results.works, 1):
        click.secho(f"{i}. {work.title} ({work.year})", fg="cyan", bold=True)
        click.echo(f"   DOI: {work.doi or 'N/A'}")
        click.echo(f"   Journal: {work.source or 'N/A'}")

        if authors and work.authors:
            # Show at most five names and summarise the remainder.
            author_str = ", ".join(work.authors[:5])
            if len(work.authors) > 5:
                author_str += f" (+{len(work.authors) - 5} more)"
            click.echo(f"   Authors: {author_str}")

        if abstracts and work.abstract:
            # Truncate long abstracts to 300 characters.
            abstract = work.abstract[:300]
            if len(work.abstract) > 300:
                abstract += "..."
            click.echo(f"   Abstract: {abstract}")

        if concepts and work.concepts:
            # ``or ""`` also covers a "name" key that is present but None,
            # which would otherwise make str.join raise TypeError.
            concept_names = [c.get("name") or "" for c in work.concepts[:5]]
            click.echo(f"   Concepts: {', '.join(concept_names)}")

        click.echo()
129
+
130
+
131
@cli.command("search-by-doi")
@click.argument("doi")
@click.option("--json", "as_json", is_flag=True, help="Output as JSON")
def search_by_doi_cmd(doi, as_json):
    """Search for a work by DOI."""
    from . import get

    try:
        work = get(doi)
    except FileNotFoundError as exc:
        click.secho(f"Error: {exc}", fg="red", err=True)
        sys.exit(1)

    # A missing work is an error for this command.
    if work is None:
        click.secho(f"Not found: {doi}", fg="red", err=True)
        sys.exit(1)

    if as_json:
        click.echo(json.dumps(work.to_dict(), indent=2))
        return

    # Title as a highlighted header, then the fixed metadata lines.
    click.secho(work.title, fg="cyan", bold=True)
    summary_lines = [
        f"DOI: {work.doi}",
        f"OpenAlex ID: {work.openalex_id}",
        f"Year: {work.year or 'N/A'}",
        f"Journal: {work.source or 'N/A'}",
        f"Type: {work.type or 'N/A'}",
        f"Citations: {work.cited_by_count or 0}",
    ]
    for line in summary_lines:
        click.echo(line)

    # Optional sections are printed only when the data is present.
    if work.authors:
        click.echo(f"Authors: {', '.join(work.authors)}")

    if work.abstract:
        click.echo(f"\nAbstract:\n{work.abstract}")

    if work.is_oa and work.oa_url:
        click.echo(f"\nOpen Access: {work.oa_url}")
168
+
169
+
170
@cli.command("status")
@click.option("--json", "as_json", is_flag=True, help="Output as JSON")
def status_cmd(as_json):
    """Show status and configuration."""
    from . import info

    try:
        status = info()
    except FileNotFoundError as exc:
        # In JSON mode the error is still emitted as a JSON document.
        if not as_json:
            click.secho(f"Error: {exc}", fg="red", err=True)
        else:
            click.echo(json.dumps({"status": "error", "error": str(exc)}, indent=2))
        sys.exit(1)

    if as_json:
        click.echo(json.dumps(status, indent=2))
        return

    click.secho("OpenAlex Local Status", fg="cyan", bold=True)
    click.echo(f"Mode: {status.get('mode', 'unknown')}")
    click.echo(f"Status: {status.get('status', 'unknown')}")

    # Optional fields: each is printed only if info() reported it.
    optional_fields = (
        ("db_path", "Database: {}"),
        ("work_count", "Works: {:,}"),
        ("fts_indexed", "FTS Indexed: {:,}"),
    )
    for key, template in optional_fields:
        if key in status:
            click.echo(template.format(status[key]))
201
+
202
+
203
def main():
    """Entry point for CLI: invoke the top-level click group."""
    return cli()
206
+
207
+
208
+ if __name__ == "__main__":
209
+ main()
@@ -0,0 +1,182 @@
1
+ """Configuration for openalex_local."""
2
+
3
+ import os
4
+ from pathlib import Path
5
+ from typing import Optional
6
+
7
+ # Default database locations (checked in order)
8
+ DEFAULT_DB_PATHS = [
9
+ Path("/home/ywatanabe/proj/openalex-local/data/openalex.db"),
10
+ Path("/home/ywatanabe/proj/openalex_local/data/openalex.db"),
11
+ Path("/mnt/nas_ug/openalex_local/data/openalex.db"),
12
+ Path.home() / ".openalex_local" / "openalex.db",
13
+ Path.cwd() / "data" / "openalex.db",
14
+ ]
15
+
16
+
17
def get_db_path() -> Path:
    """
    Get database path from environment or auto-detect.

    Priority:
        1. OPENALEX_LOCAL_DB environment variable (a leading ``~`` is
           expanded to the user's home directory)
        2. First existing path from DEFAULT_DB_PATHS

    Returns:
        Path to the database file

    Raises:
        FileNotFoundError: If the environment variable points at a missing
            path, or no default location exists
    """
    env_path = os.environ.get("OPENALEX_LOCAL_DB")
    if env_path:
        # Environment values are not shell-expanded, so "~/openalex.db"
        # would otherwise be looked up literally and never be found.
        path = Path(env_path).expanduser()
        if path.exists():
            return path
        # An explicit setting that is wrong is an error; do not silently
        # fall through to the default locations.
        raise FileNotFoundError(f"OPENALEX_LOCAL_DB path not found: {env_path}")

    # Auto-detect from default locations
    for path in DEFAULT_DB_PATHS:
        if path.exists():
            return path

    raise FileNotFoundError(
        "OpenAlex database not found. Set OPENALEX_LOCAL_DB environment variable "
        f"or place database at one of: {[str(p) for p in DEFAULT_DB_PATHS]}"
    )
48
+
49
+
50
+ # Default port (SciTeX port scheme: 31292 for openalex)
51
+ DEFAULT_PORT = 31292
52
+ DEFAULT_HOST = "0.0.0.0"
53
+
54
+
55
+ class Config:
56
+ """Configuration container."""
57
+
58
+ _db_path: Optional[Path] = None
59
+ _api_url: Optional[str] = None
60
+ _mode: Optional[str] = None # "db" or "http"
61
+ _port: Optional[int] = None
62
+ _host: Optional[str] = None
63
+
64
+ @classmethod
65
+ def get_db_path(cls) -> Path:
66
+ """Get or auto-detect database path."""
67
+ if cls._db_path is None:
68
+ cls._db_path = get_db_path()
69
+ return cls._db_path
70
+
71
+ @classmethod
72
+ def set_db_path(cls, path: str | Path) -> None:
73
+ """Set database path explicitly."""
74
+ path = Path(path)
75
+ if not path.exists():
76
+ raise FileNotFoundError(f"Database not found: {path}")
77
+ cls._db_path = path
78
+ cls._mode = "db"
79
+
80
+ @classmethod
81
+ def get_api_url(cls) -> str:
82
+ """Get API URL for HTTP mode."""
83
+ if cls._api_url:
84
+ return cls._api_url
85
+
86
+ # Check environment variables (scitex priority)
87
+ for var in [
88
+ "SCITEX_SCHOLAR_OPENALEX_API_URL",
89
+ "OPENALEX_LOCAL_API_URL",
90
+ ]:
91
+ url = os.environ.get(var)
92
+ if url:
93
+ return url
94
+
95
+ return "http://localhost:31292"
96
+
97
+ @classmethod
98
+ def set_api_url(cls, url: str) -> None:
99
+ """Set API URL explicitly."""
100
+ cls._api_url = url
101
+ cls._mode = "http"
102
+
103
+ @classmethod
104
+ def get_mode(cls) -> str:
105
+ """
106
+ Get current mode.
107
+
108
+ Priority:
109
+ 1. Explicitly set mode
110
+ 2. OPENALEX_LOCAL_MODE environment variable
111
+ 3. Auto-detect based on available config
112
+
113
+ Returns:
114
+ "db" or "http"
115
+ """
116
+ if cls._mode:
117
+ return cls._mode
118
+
119
+ # Check environment variable
120
+ env_mode = os.environ.get("OPENALEX_LOCAL_MODE", "").lower()
121
+ if env_mode in ("db", "http"):
122
+ return env_mode
123
+
124
+ # Check if API URL is set
125
+ if os.environ.get("OPENALEX_LOCAL_API_URL"):
126
+ return "http"
127
+
128
+ # Default to db mode (will raise FileNotFoundError if no database)
129
+ return "db"
130
+
131
+ @classmethod
132
+ def get_port(cls) -> int:
133
+ """Get server port."""
134
+ if cls._port:
135
+ return cls._port
136
+
137
+ # Check environment variables (scitex priority)
138
+ for var in [
139
+ "SCITEX_SCHOLAR_OPENALEX_PORT",
140
+ "OPENALEX_LOCAL_PORT",
141
+ ]:
142
+ port = os.environ.get(var)
143
+ if port:
144
+ return int(port)
145
+
146
+ return DEFAULT_PORT
147
+
148
+ @classmethod
149
+ def set_port(cls, port: int) -> None:
150
+ """Set server port explicitly."""
151
+ cls._port = port
152
+
153
+ @classmethod
154
+ def get_host(cls) -> str:
155
+ """Get server host."""
156
+ if cls._host:
157
+ return cls._host
158
+
159
+ # Check environment variables (scitex priority)
160
+ for var in [
161
+ "SCITEX_SCHOLAR_OPENALEX_HOST",
162
+ "OPENALEX_LOCAL_HOST",
163
+ ]:
164
+ host = os.environ.get(var)
165
+ if host:
166
+ return host
167
+
168
+ return DEFAULT_HOST
169
+
170
+ @classmethod
171
+ def set_host(cls, host: str) -> None:
172
+ """Set server host explicitly."""
173
+ cls._host = host
174
+
175
+ @classmethod
176
+ def reset(cls) -> None:
177
+ """Reset configuration (for testing)."""
178
+ cls._db_path = None
179
+ cls._api_url = None
180
+ cls._mode = None
181
+ cls._port = None
182
+ cls._host = None
@@ -0,0 +1,161 @@
1
+ """Database connection handling for openalex_local."""
2
+
3
+ import json as _json
4
+ import sqlite3 as _sqlite3
5
+ from contextlib import contextmanager as _contextmanager
6
+ from pathlib import Path as _Path
7
+ from typing import Any, Dict, Generator, List, Optional
8
+
9
+ from .config import Config as _Config
10
+
11
+ __all__ = [
12
+ "Database",
13
+ "get_db",
14
+ "close_db",
15
+ "connection",
16
+ ]
17
+
18
+
19
+ class Database:
20
+ """
21
+ Database connection manager.
22
+
23
+ Supports both direct usage and context manager pattern.
24
+ """
25
+
26
+ def __init__(self, db_path: Optional[str | _Path] = None):
27
+ """
28
+ Initialize database connection.
29
+
30
+ Args:
31
+ db_path: Path to database. If None, auto-detects.
32
+ """
33
+ if db_path:
34
+ self.db_path = _Path(db_path)
35
+ else:
36
+ self.db_path = _Config.get_db_path()
37
+
38
+ self.conn: Optional[_sqlite3.Connection] = None
39
+ self._connect()
40
+
41
+ def _connect(self) -> None:
42
+ """Establish database connection."""
43
+ self.conn = _sqlite3.connect(self.db_path, check_same_thread=False)
44
+ self.conn.row_factory = _sqlite3.Row
45
+
46
+ def close(self) -> None:
47
+ """Close database connection."""
48
+ if self.conn:
49
+ self.conn.close()
50
+ self.conn = None
51
+
52
+ def __enter__(self) -> "Database":
53
+ return self
54
+
55
+ def __exit__(self, exc_type, exc_val, exc_tb) -> None:
56
+ self.close()
57
+
58
+ def execute(self, query: str, params: tuple = ()) -> _sqlite3.Cursor:
59
+ """Execute SQL query."""
60
+ return self.conn.execute(query, params)
61
+
62
+ def fetchone(self, query: str, params: tuple = ()) -> Optional[_sqlite3.Row]:
63
+ """Execute query and fetch one result."""
64
+ cursor = self.execute(query, params)
65
+ return cursor.fetchone()
66
+
67
+ def fetchall(self, query: str, params: tuple = ()) -> List[_sqlite3.Row]:
68
+ """Execute query and fetch all results."""
69
+ cursor = self.execute(query, params)
70
+ return cursor.fetchall()
71
+
72
+ def get_work(self, openalex_id: str) -> Optional[Dict[str, Any]]:
73
+ """
74
+ Get work data by OpenAlex ID.
75
+
76
+ Args:
77
+ openalex_id: OpenAlex ID (e.g., W2741809807)
78
+
79
+ Returns:
80
+ Work data dictionary or None
81
+ """
82
+ row = self.fetchone("SELECT * FROM works WHERE openalex_id = ?", (openalex_id,))
83
+ if row:
84
+ return self._row_to_dict(row)
85
+ return None
86
+
87
+ def get_work_by_doi(self, doi: str) -> Optional[Dict[str, Any]]:
88
+ """
89
+ Get work data by DOI.
90
+
91
+ Args:
92
+ doi: DOI string
93
+
94
+ Returns:
95
+ Work data dictionary or None
96
+ """
97
+ row = self.fetchone("SELECT * FROM works WHERE doi = ?", (doi,))
98
+ if row:
99
+ return self._row_to_dict(row)
100
+ return None
101
+
102
+ def _row_to_dict(self, row: _sqlite3.Row) -> Dict[str, Any]:
103
+ """Convert SQLite row to dictionary, parsing JSON fields."""
104
+ result = dict(row)
105
+
106
+ # Parse JSON fields
107
+ for field in ["authors_json", "concepts_json", "topics_json"]:
108
+ if field in result and result[field]:
109
+ try:
110
+ result[field.replace("_json", "")] = _json.loads(result[field])
111
+ except (TypeError, _json.JSONDecodeError):
112
+ result[field.replace("_json", "")] = []
113
+
114
+ # Parse raw_json if present
115
+ if "raw_json" in result and result["raw_json"]:
116
+ try:
117
+ result["raw"] = _json.loads(result["raw_json"])
118
+ except (TypeError, _json.JSONDecodeError):
119
+ result["raw"] = {}
120
+
121
+ return result
122
+
123
+
124
+ # Singleton connection for convenience functions
125
+ _db: Optional[Database] = None
126
+
127
+
128
def get_db() -> Database:
    """Return the shared Database instance, creating it on first use."""
    global _db
    if _db is not None:
        return _db
    _db = Database()
    return _db
134
+
135
+
136
def close_db() -> None:
    """Close and forget the shared Database instance, if there is one."""
    global _db
    if not _db:
        return
    _db.close()
    _db = None
142
+
143
+
144
@_contextmanager
def connection(
    db_path: Optional[str | _Path] = None,
) -> Generator[Database, None, None]:
    """
    Open a Database for the duration of a ``with`` block.

    Args:
        db_path: Path to database. If None, auto-detects.

    Yields:
        Database instance, closed when the block exits
    """
    # Database is itself a context manager whose exit closes the connection.
    with Database(db_path) as opened:
        yield opened
@@ -0,0 +1,165 @@
1
+ """Full-text search using FTS5."""
2
+
3
+ import re as _re
4
+ import time as _time
5
+ from typing import List, Optional
6
+
7
+ from .db import Database, get_db
8
+ from .models import SearchResult, Work
9
+
10
+ __all__ = [
11
+ "search",
12
+ "count",
13
+ "search_ids",
14
+ ]
15
+
16
+
17
def _sanitize_query(query: str) -> str:
    """
    Sanitize query for FTS5.

    A query that is already wrapped in double quotes is returned unchanged.
    Otherwise, if it contains a hyphenated word or one of ``/ \\ @ # $ % ^ &``,
    every whitespace-separated word is wrapped in double quotes so FTS5
    treats it as a string rather than as operator syntax. Queries with
    none of those characters are returned unchanged, which keeps FTS5
    operators such as AND/OR usable.

    Args:
        query: Raw user query

    Returns:
        Query string to pass to MATCH
    """
    if query.startswith('"') and query.endswith('"'):
        return query

    has_hyphenated_word = _re.search(r"\w+-\w+", query)
    has_special = _re.search(r"[/\\@#$%^&]", query)
    if not (has_hyphenated_word or has_special):
        return query

    # Inside an FTS5 string a literal double quote is written as two double
    # quotes; without this, a word containing '"' yields invalid syntax.
    return " ".join('"' + word.replace('"', '""') + '"' for word in query.split())
35
+
36
+
37
def search(
    query: str,
    limit: int = 20,
    offset: int = 0,
    db: Optional[Database] = None,
) -> SearchResult:
    """
    Run an FTS5 search and return one page of works plus the total count.

    Two statements are executed against ``works_fts``: a COUNT for the
    total, then a join with ``works`` limited by ``limit``/``offset``. The
    reported elapsed time covers sanitizing and both queries, but not the
    conversion of rows to Work objects.

    Args:
        query: Search query (supports FTS5 syntax like AND, OR, NOT, "phrases")
        limit: Maximum results to return
        offset: Skip first N results (for pagination)
        db: Database connection (uses singleton if not provided)

    Returns:
        SearchResult with matching works

    Example:
        >>> results = search("machine learning neural networks")
        >>> print(f"Found {results.total} matches in {results.elapsed_ms:.1f}ms")
    """
    database = get_db() if db is None else db

    started = _time.perf_counter()
    match_expr = _sanitize_query(query)

    # Total number of matches, independent of paging.
    total_row = database.fetchone(
        "SELECT COUNT(*) as total FROM works_fts WHERE works_fts MATCH ?",
        (match_expr,),
    )
    total = total_row["total"] if total_row else 0

    # The requested page of matching works.
    page_rows = database.fetchall(
        """
        SELECT w.*
        FROM works_fts f
        JOIN works w ON f.rowid = w.rowid
        WHERE works_fts MATCH ?
        LIMIT ? OFFSET ?
        """,
        (match_expr, limit, offset),
    )

    elapsed_ms = (_time.perf_counter() - started) * 1000

    works = [Work.from_db_row(database._row_to_dict(row)) for row in page_rows]

    return SearchResult(
        works=works,
        total=total,
        query=query,
        elapsed_ms=elapsed_ms,
    )
100
+
101
+
102
def count(query: str, db: Optional[Database] = None) -> int:
    """
    Return how many rows of the FTS index match a query.

    Args:
        query: FTS5 search query
        db: Database connection (uses singleton if not provided)

    Returns:
        Number of matching works
    """
    database = get_db() if db is None else db

    match_expr = _sanitize_query(query)
    total_row = database.fetchone(
        "SELECT COUNT(*) as total FROM works_fts WHERE works_fts MATCH ?",
        (match_expr,),
    )
    if not total_row:
        return 0
    return total_row["total"]
122
+
123
+
124
def search_ids(
    query: str,
    limit: int = 1000,
    db: Optional[Database] = None,
) -> List[str]:
    """
    Search and return only the OpenAlex IDs of matching works.

    Args:
        query: FTS5 search query
        limit: Maximum IDs to return
        db: Database connection (uses singleton if not provided)

    Returns:
        List of matching OpenAlex IDs
    """
    database = get_db() if db is None else db

    match_expr = _sanitize_query(query)
    id_rows = database.fetchall(
        """
        SELECT w.openalex_id
        FROM works_fts f
        JOIN works w ON f.rowid = w.rowid
        WHERE works_fts MATCH ?
        LIMIT ?
        """,
        (match_expr, limit),
    )

    matched_ids = []
    for id_row in id_rows:
        matched_ids.append(id_row["openalex_id"])
    return matched_ids
156
+
157
+
158
def _search_with_db(db: Database, query: str, limit: int, offset: int) -> SearchResult:
    """Run search() against a caller-supplied connection instead of the singleton."""
    return search(query=query, limit=limit, offset=offset, db=db)
161
+
162
+
163
def _count_with_db(db: Database, query: str) -> int:
    """Run count() against a caller-supplied connection instead of the singleton."""
    return count(query=query, db=db)
@@ -66,7 +66,11 @@ class Work:
66
66
  openalex_id = data.get("id", "").replace("https://openalex.org/", "")
67
67
 
68
68
  # Extract DOI
69
- doi = data.get("doi", "").replace("https://doi.org/", "") if data.get("doi") else None
69
+ doi = (
70
+ data.get("doi", "").replace("https://doi.org/", "")
71
+ if data.get("doi")
72
+ else None
73
+ )
70
74
 
71
75
  # Extract authors
72
76
  authors = []
@@ -81,7 +85,11 @@ class Work:
81
85
  inv_index = data.get("abstract_inverted_index")
82
86
  if inv_index:
83
87
  words = sorted(
84
- [(pos, word) for word, positions in inv_index.items() for pos in positions]
88
+ [
89
+ (pos, word)
90
+ for word, positions in inv_index.items()
91
+ for pos in positions
92
+ ]
85
93
  )
86
94
  abstract = " ".join(word for _, word in words)
87
95
 
@@ -103,7 +111,10 @@ class Work:
103
111
 
104
112
  # Extract topics (top 3)
105
113
  topics = [
106
- {"name": t.get("display_name"), "subfield": t.get("subfield", {}).get("display_name")}
114
+ {
115
+ "name": t.get("display_name"),
116
+ "subfield": t.get("subfield", {}).get("display_name"),
117
+ }
107
118
  for t in (data.get("topics") or [])[:3]
108
119
  ]
109
120
 
@@ -135,6 +146,39 @@ class Work:
135
146
  oa_url=oa_info.get("oa_url"),
136
147
  )
137
148
 
149
@classmethod
def from_db_row(cls, data: dict) -> "Work":
    """
    Create Work from database row dictionary.

    List-valued fields fall back to an empty list both when the key is
    missing and when it is present with a None value (for example a NULL
    column), so the resulting Work never carries None where a list is
    expected.

    Args:
        data: Database row as dictionary (with parsed JSON fields)

    Returns:
        Work instance
    """
    return cls(
        openalex_id=data.get("openalex_id") or "",
        doi=data.get("doi"),
        title=data.get("title"),
        abstract=data.get("abstract"),
        authors=data.get("authors") or [],
        year=data.get("year"),
        source=data.get("source"),
        issn=data.get("issn"),
        volume=data.get("volume"),
        issue=data.get("issue"),
        pages=data.get("pages"),
        publisher=data.get("publisher"),
        type=data.get("type"),
        concepts=data.get("concepts") or [],
        topics=data.get("topics") or [],
        cited_by_count=data.get("cited_by_count"),
        referenced_works=data.get("referenced_works") or [],
        # Normalised to a real bool, whatever value the row holds.
        is_oa=bool(data.get("is_oa", False)),
        oa_url=data.get("oa_url"),
    )
181
+
138
182
  def to_dict(self) -> dict:
139
183
  """Convert to dictionary."""
140
184
  return {
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: openalex-local
3
- Version: 0.1.0
3
+ Version: 0.3.0
4
4
  Summary: Local OpenAlex database with 284M+ works, abstracts, and semantic search
5
5
  Author-email: Yusuke Watanabe <ywatanabe@alumni.u-tokyo.ac.jp>
6
6
  License: AGPL-3.0
@@ -1,7 +1,12 @@
1
1
  README.md
2
2
  pyproject.toml
3
3
  src/openalex_local/__init__.py
4
+ src/openalex_local/__main__.py
5
+ src/openalex_local/api.py
6
+ src/openalex_local/cli.py
4
7
  src/openalex_local/config.py
8
+ src/openalex_local/db.py
9
+ src/openalex_local/fts.py
5
10
  src/openalex_local/models.py
6
11
  src/openalex_local.egg-info/PKG-INFO
7
12
  src/openalex_local.egg-info/SOURCES.txt
@@ -1,14 +0,0 @@
1
- """
2
- OpenAlex Local - Local OpenAlex database with 284M+ works and semantic search.
3
-
4
- Example:
5
- >>> from openalex_local import search, get
6
- >>> results = search("machine learning neural networks")
7
- >>> work = get("W2741809807") # OpenAlex ID
8
- >>> work = get("10.1038/nature12373") # or DOI
9
- """
10
-
11
- __version__ = "0.1.0"
12
-
13
- # API will be exposed here after implementation
14
- # from .api import search, get, count, info
@@ -1,73 +0,0 @@
1
- """Configuration for openalex_local."""
2
-
3
- import os
4
- from pathlib import Path
5
- from typing import Optional
6
-
7
- # Default database locations (checked in order)
8
- DEFAULT_DB_PATHS = [
9
- Path("/home/ywatanabe/proj/openalex-local/data/openalex.db"),
10
- Path("/home/ywatanabe/proj/openalex_local/data/openalex.db"),
11
- Path("/mnt/nas_ug/openalex_local/data/openalex.db"),
12
- Path.home() / ".openalex_local" / "openalex.db",
13
- Path.cwd() / "data" / "openalex.db",
14
- ]
15
-
16
-
17
- def get_db_path() -> Path:
18
- """
19
- Get database path from environment or auto-detect.
20
-
21
- Priority:
22
- 1. OPENALEX_LOCAL_DB environment variable
23
- 2. First existing path from DEFAULT_DB_PATHS
24
-
25
- Returns:
26
- Path to the database file
27
-
28
- Raises:
29
- FileNotFoundError: If no database found
30
- """
31
- # Check environment variable first
32
- env_path = os.environ.get("OPENALEX_LOCAL_DB")
33
- if env_path:
34
- path = Path(env_path)
35
- if path.exists():
36
- return path
37
- raise FileNotFoundError(f"OPENALEX_LOCAL_DB path not found: {env_path}")
38
-
39
- # Auto-detect from default locations
40
- for path in DEFAULT_DB_PATHS:
41
- if path.exists():
42
- return path
43
-
44
- raise FileNotFoundError(
45
- "OpenAlex database not found. Set OPENALEX_LOCAL_DB environment variable "
46
- f"or place database at one of: {[str(p) for p in DEFAULT_DB_PATHS]}"
47
- )
48
-
49
-
50
- class Config:
51
- """Configuration container."""
52
-
53
- _db_path: Optional[Path] = None
54
-
55
- @classmethod
56
- def get_db_path(cls) -> Path:
57
- """Get or auto-detect database path."""
58
- if cls._db_path is None:
59
- cls._db_path = get_db_path()
60
- return cls._db_path
61
-
62
- @classmethod
63
- def set_db_path(cls, path: str | Path) -> None:
64
- """Set database path explicitly."""
65
- path = Path(path)
66
- if not path.exists():
67
- raise FileNotFoundError(f"Database not found: {path}")
68
- cls._db_path = path
69
-
70
- @classmethod
71
- def reset(cls) -> None:
72
- """Reset configuration (for testing)."""
73
- cls._db_path = None
File without changes