crossref-local 0.5.0-py3-none-any.whl → 0.5.1-py3-none-any.whl

This diff shows the changes between publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.
@@ -0,0 +1,344 @@
+ """Export functionality for Work and SearchResult objects.
+
+ Supports multiple output formats:
+ - text: Human-readable formatted text
+ - json: JSON format with all fields
+ - bibtex: BibTeX bibliography format
+ """
+
+ import json as _json
+ from pathlib import Path as _Path
+ from typing import TYPE_CHECKING, List, Optional, Union
+
+ if TYPE_CHECKING:
+     from .models import SearchResult, Work
+
+ __all__ = [
+     "save",
+     "export_text",
+     "export_json",
+     "export_bibtex",
+     "SUPPORTED_FORMATS",
+ ]
+
+ SUPPORTED_FORMATS = ["text", "json", "bibtex"]
+
+
+ def _sanitize_bibtex_key(doi: str) -> str:
+     """Convert DOI to valid BibTeX key."""
+     return doi.replace("/", "_").replace(".", "_").replace("-", "_")
+
+
+ def _escape_bibtex(text: str) -> str:
+     """Escape special characters for BibTeX."""
+     if not text:
+         return ""
+     # Escape special LaTeX characters
+     replacements = [
+         ("&", r"\&"),
+         ("%", r"\%"),
+         ("$", r"\$"),
+         ("#", r"\#"),
+         ("_", r"\_"),
+         ("{", r"\{"),
+         ("}", r"\}"),
+     ]
+     for old, new in replacements:
+         text = text.replace(old, new)
+     return text
+
+
+ def work_to_text(work: "Work", include_abstract: bool = False) -> str:
+     """Convert a Work to human-readable text format.
+
+     Args:
+         work: Work object to convert
+         include_abstract: Whether to include abstract
+
+     Returns:
+         Formatted text string
+     """
+     lines = []
+
+     # Title
+     title = work.title or "Untitled"
+     year = f"({work.year})" if work.year else ""
+     lines.append(f"{title} {year}".strip())
+
+     # Authors
+     if work.authors:
+         authors_str = ", ".join(work.authors[:5])
+         if len(work.authors) > 5:
+             authors_str += f" et al. ({len(work.authors)} authors)"
+         lines.append(f"Authors: {authors_str}")
+
+     # Journal and DOI
+     if work.journal:
+         journal_line = f"Journal: {work.journal}"
+         if work.volume:
+             journal_line += f", {work.volume}"
+         if work.issue:
+             journal_line += f"({work.issue})"
+         if work.page:
+             journal_line += f", {work.page}"
+         lines.append(journal_line)
+
+     lines.append(f"DOI: {work.doi}")
+
+     # Impact factor
+     if work.impact_factor:
+         lines.append(
+             f"Impact Factor: {work.impact_factor:.2f} ({work.impact_factor_source or 'unknown'})"
+         )
+
+     # Citation count
+     if work.citation_count is not None:
+         lines.append(f"Citations: {work.citation_count}")
+
+     # Abstract
+     if include_abstract and work.abstract:
+         # Strip XML tags
+         import re
+
+         abstract = re.sub(r"<[^>]+>", " ", work.abstract)
+         abstract = re.sub(r"\s+", " ", abstract).strip()
+         lines.append(f"Abstract: {abstract}")
+
+     return "\n".join(lines)
+
+
+ def work_to_bibtex(work: "Work") -> str:
+     """Convert a Work to BibTeX format.
+
+     Args:
+         work: Work object to convert
+
+     Returns:
+         BibTeX entry string
+     """
+     key = _sanitize_bibtex_key(work.doi) if work.doi else "unknown"
+     work_type = work.type or "article"
+
+     # Map CrossRef types to BibTeX types
+     bibtex_type_map = {
+         "journal-article": "article",
+         "book-chapter": "incollection",
+         "book": "book",
+         "proceedings-article": "inproceedings",
+         "dissertation": "phdthesis",
+         "report": "techreport",
+     }
+     bibtex_type = bibtex_type_map.get(work_type, "misc")
+
+     lines = [f"@{bibtex_type}{{{key},"]
+
+     if work.title:
+         lines.append(f" title = {{{_escape_bibtex(work.title)}}},")
+
+     if work.authors:
+         authors = " and ".join(work.authors)
+         lines.append(f" author = {{{_escape_bibtex(authors)}}},")
+
+     if work.year:
+         lines.append(f" year = {{{work.year}}},")
+
+     if work.journal:
+         lines.append(f" journal = {{{_escape_bibtex(work.journal)}}},")
+
+     if work.volume:
+         lines.append(f" volume = {{{work.volume}}},")
+
+     if work.issue:
+         lines.append(f" number = {{{work.issue}}},")
+
+     if work.page:
+         lines.append(f" pages = {{{work.page}}},")
+
+     if work.publisher:
+         lines.append(f" publisher = {{{_escape_bibtex(work.publisher)}}},")
+
+     if work.doi:
+         lines.append(f" doi = {{{work.doi}}},")
+
+     if work.url:
+         lines.append(f" url = {{{work.url}}},")
+
+     if work.issn:
+         lines.append(f" issn = {{{work.issn}}},")
+
+     lines.append("}")
+
+     return "\n".join(lines)
+
+
+ def export_text(
+     works: List["Work"],
+     include_abstract: bool = False,
+     query: Optional[str] = None,
+     total: Optional[int] = None,
+     elapsed_ms: Optional[float] = None,
+ ) -> str:
+     """Export works to text format.
+
+     Args:
+         works: List of Work objects
+         include_abstract: Whether to include abstracts
+         query: Original search query (for header)
+         total: Total number of matches
+         elapsed_ms: Search time in milliseconds
+
+     Returns:
+         Formatted text string
+     """
+     lines = []
+
+     # Header
+     if query is not None:
+         lines.append(f"Search: {query}")
+     if total is not None:
+         lines.append(f"Found: {total:,} matches")
+     if elapsed_ms is not None:
+         lines.append(f"Time: {elapsed_ms:.1f}ms")
+     lines.append("")
+     lines.append("=" * 60)
+     lines.append("")
+
+     # Works
+     for i, work in enumerate(works, 1):
+         lines.append(f"[{i}]")
+         lines.append(work_to_text(work, include_abstract=include_abstract))
+         lines.append("")
+         lines.append("-" * 40)
+         lines.append("")
+
+     return "\n".join(lines)
+
+
+ def export_json(
+     works: List["Work"],
+     query: Optional[str] = None,
+     total: Optional[int] = None,
+     elapsed_ms: Optional[float] = None,
+     indent: int = 2,
+ ) -> str:
+     """Export works to JSON format.
+
+     Args:
+         works: List of Work objects
+         query: Original search query
+         total: Total number of matches
+         elapsed_ms: Search time in milliseconds
+         indent: JSON indentation
+
+     Returns:
+         JSON string
+     """
+     data = {
+         "works": [w.to_dict() for w in works],
+     }
+
+     if query is not None:
+         data["query"] = query
+     if total is not None:
+         data["total"] = total
+     if elapsed_ms is not None:
+         data["elapsed_ms"] = elapsed_ms
+
+     return _json.dumps(data, indent=indent, ensure_ascii=False)
+
+
+ def export_bibtex(works: List["Work"]) -> str:
+     """Export works to BibTeX format.
+
+     Args:
+         works: List of Work objects
+
+     Returns:
+         BibTeX string with all entries
+     """
+     entries = [work_to_bibtex(w) for w in works]
+     return "\n\n".join(entries)
+
+
+ def save(
+     data: Union["Work", "SearchResult", List["Work"]],
+     path: Union[str, _Path],
+     format: str = "json",
+     include_abstract: bool = True,
+ ) -> str:
+     """Save Work(s) or SearchResult to a file.
+
+     Args:
+         data: Work, SearchResult, or list of Works to save
+         path: Output file path
+         format: Output format ("text", "json", "bibtex")
+         include_abstract: Include abstracts in text format
+
+     Returns:
+         Path to saved file
+
+     Raises:
+         ValueError: If format is not supported
+
+     Examples:
+         >>> from crossref_local import search, save
+         >>> results = search("machine learning", limit=10)
+         >>> save(results, "results.json")
+         >>> save(results, "results.bib", format="bibtex")
+         >>> save(results, "results.txt", format="text")
+     """
+     from .models import SearchResult, Work
+
+     if format not in SUPPORTED_FORMATS:
+         raise ValueError(
+             f"Unsupported format: {format}. "
+             f"Supported formats: {', '.join(SUPPORTED_FORMATS)}"
+         )
+
+     path = _Path(path)
+
+     # Extract works and metadata
+     if isinstance(data, Work):
+         works = [data]
+         query = None
+         total = None
+         elapsed_ms = None
+     elif isinstance(data, SearchResult):
+         works = data.works
+         query = data.query
+         total = data.total
+         elapsed_ms = data.elapsed_ms
+     elif isinstance(data, list):
+         works = data
+         query = None
+         total = len(data)
+         elapsed_ms = None
+     else:
+         raise TypeError(f"Unsupported data type: {type(data)}")
+
+     # Generate content
+     if format == "text":
+         content = export_text(
+             works,
+             include_abstract=include_abstract,
+             query=query,
+             total=total,
+             elapsed_ms=elapsed_ms,
+         )
+     elif format == "json":
+         content = export_json(
+             works,
+             query=query,
+             total=total,
+             elapsed_ms=elapsed_ms,
+         )
+     elif format == "bibtex":
+         content = export_bibtex(works)
+     else:
+         raise ValueError(f"Unsupported format: {format}")
+
+     # Write to file
+     path.parent.mkdir(parents=True, exist_ok=True)
+     path.write_text(content, encoding="utf-8")
+
+     return str(path)
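
Note: a minimal usage sketch of the new export module, taken from the save() docstring above (it assumes the package is importable as crossref_local and a local database is configured):

    from crossref_local import search, save

    results = search("machine learning", limit=10)
    save(results, "results.json")                  # full JSON dump
    save(results, "results.bib", format="bibtex")  # BibTeX entries
    save(results, "results.txt", format="text")    # human-readable text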
@@ -5,7 +5,7 @@ import time as _time
  from typing import List, Optional

  from .db import Database, get_db
- from .models import SearchResult, Work
+ from .models import LimitInfo, SearchResult, Work

  __all__ = [
      "search",
@@ -102,11 +102,30 @@ def search(
          metadata = db._decompress_metadata(row["metadata"])
          works.append(Work.from_metadata(row["doi"], metadata))

+     # Build limit info
+     returned = len(works)
+     capped = returned < total and returned == limit
+     capped_reason = None
+     if capped:
+         capped_reason = (
+             f"crossref-local: Limited to {limit} results (total available: {total})"
+         )
+
+     limit_info = LimitInfo(
+         requested=limit,
+         returned=returned,
+         total_available=total,
+         capped=capped,
+         capped_reason=capped_reason,
+         stage="crossref-local",
+     )
+
      return SearchResult(
          works=works,
          total=total,
          query=query,
          elapsed_ms=elapsed_ms,
+         limit_info=limit_info,
      )


@@ -7,6 +7,7 @@ from typing import List, Optional
  __all__ = [
      "Work",
      "SearchResult",
+     "LimitInfo",
  ]


@@ -48,6 +49,8 @@ class Work:
      url: Optional[str] = None
      citation_count: Optional[int] = None
      references: List[str] = _field(default_factory=list)
+     impact_factor: Optional[float] = None
+     impact_factor_source: Optional[str] = None

      @classmethod
      def from_metadata(cls, doi: str, metadata: dict) -> "Work":
@@ -130,6 +133,8 @@ class Work:
          "url": self.url,
          "citation_count": self.citation_count,
          "references": self.references,
+         "impact_factor": self.impact_factor,
+         "impact_factor_source": self.impact_factor_source,
      }

      def citation(self, style: str = "apa") -> str:
@@ -163,6 +168,84 @@ class Work:

          return ". ".join(filter(None, parts))

+     def to_text(self, include_abstract: bool = False) -> str:
+         """
+         Format as human-readable text.
+
+         Args:
+             include_abstract: Include abstract in output
+
+         Returns:
+             Formatted text string
+         """
+         from .export import work_to_text
+
+         return work_to_text(self, include_abstract=include_abstract)
+
+     def to_bibtex(self) -> str:
+         """
+         Format as BibTeX entry.
+
+         Returns:
+             BibTeX string
+         """
+         from .export import work_to_bibtex
+
+         return work_to_bibtex(self)
+
+     def save(self, path: str, format: str = "json") -> str:
+         """
+         Save work to file.
+
+         Args:
+             path: Output file path
+             format: Output format ("text", "json", "bibtex")
+
+         Returns:
+             Path to saved file
+
+         Examples:
+             >>> work = get("10.1038/nature12373")
+             >>> work.save("paper.json")
+             >>> work.save("paper.bib", format="bibtex")
+         """
+         from .export import save
+
+         return save(self, path, format=format)
+
+
+ @_dataclass
+ class LimitInfo:
+     """
+     Information about result limiting at each stage.
+
+     Attributes:
+         requested: Number of results requested
+         returned: Number of results actually returned
+         total_available: Total matches in database
+         capped: Whether results were capped
+         capped_reason: Why results were capped (if applicable)
+         stage: Which stage applied this limit (e.g., "crossref-local", "scitex", "django")
+     """
+
+     requested: int
+     returned: int
+     total_available: int
+     capped: bool = False
+     capped_reason: Optional[str] = None
+     stage: str = "crossref-local"
+
+     def to_dict(self) -> dict:
+         """Convert to dictionary."""
+         return {
+             "requested": self.requested,
+             "returned": self.returned,
+             "total_available": self.total_available,
+             "capped": self.capped,
+             "capped_reason": self.capped_reason,
+             "stage": self.stage,
+         }
+

  @_dataclass
  class SearchResult:
@@ -174,12 +257,14 @@ class SearchResult:
          total: Total number of matches
          query: Original search query
          elapsed_ms: Search time in milliseconds
+         limit_info: Information about result limiting
      """

      works: List[Work]
      total: int
      query: str
      elapsed_ms: float
+     limit_info: Optional[LimitInfo] = None

      def __len__(self) -> int:
          return len(self.works)
@@ -189,3 +274,27 @@ class SearchResult:

      def __getitem__(self, idx):
          return self.works[idx]
+
+     def save(
+         self, path: str, format: str = "json", include_abstract: bool = True
+     ) -> str:
+         """
+         Save search results to file.
+
+         Args:
+             path: Output file path
+             format: Output format ("text", "json", "bibtex")
+             include_abstract: Include abstracts in text format
+
+         Returns:
+             Path to saved file
+
+         Examples:
+             >>> results = search("machine learning", limit=10)
+             >>> results.save("results.json")
+             >>> results.save("results.bib", format="bibtex")
+             >>> results.save("results.txt", format="text")
+         """
+         from .export import save
+
+         return save(self, path, format=format, include_abstract=include_abstract)
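
Note: a sketch of how a caller might inspect the new limit_info field on a SearchResult, using only the LimitInfo attributes defined above (the search import follows the export docstrings):

    from crossref_local import search

    results = search("deep learning", limit=5)
    info = results.limit_info  # Optional[LimitInfo]; may be None
    if info is not None and info.capped:
        # capped_reason follows the pattern built in search(), e.g.
        # "crossref-local: Limited to 5 results (total available: 1234)"
        print(info.capped_reason)
        print(f"returned {info.returned} of {info.total_available} matches")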
@@ -10,7 +10,7 @@ import urllib.parse
  import urllib.error
  from typing import List, Optional, Dict, Any

- from .._core.models import Work, SearchResult
+ from .._core.models import SearchResult, Work
  from .._core.config import DEFAULT_PORT

  # Default URL uses SCITEX port convention
@@ -104,6 +104,7 @@ class RemoteClient:
          year: Optional[int] = None,
          limit: int = 10,
          offset: int = 0,
+         with_if: bool = False,
      ) -> SearchResult:
          """
          Search for papers.
@@ -114,8 +115,9 @@
              title: Search by title (explicit)
              authors: Search by author name
              year: Filter by publication year
-             limit: Maximum results (default: 10, max: 100)
+             limit: Maximum results (default: 10)
              offset: Skip first N results for pagination
+             with_if: Include impact factor data (OpenAlex)

          Returns:
              SearchResult with matching works
@@ -125,8 +127,9 @@

          params = {
              "q": search_query,
-             "limit": min(limit, 100),
+             "limit": limit,
              "offset": offset,
+             "with_if": with_if,
          }

          data = self._request("/works", params)
@@ -142,19 +145,38 @@
                  authors=item.get("authors", []),
                  year=item.get("year"),
                  journal=item.get("journal"),
+                 issn=item.get("issn"),
                  volume=item.get("volume"),
                  issue=item.get("issue"),
                  page=item.get("page") or item.get("pages"),
                  abstract=item.get("abstract"),
                  citation_count=item.get("citation_count"),
+                 impact_factor=item.get("impact_factor"),
+                 impact_factor_source=item.get("impact_factor_source"),
              )
              works.append(work)

+         # Parse limit_info from response
+         limit_info = None
+         if data.get("limit_info"):
+             from .._core.models import LimitInfo
+
+             li = data["limit_info"]
+             limit_info = LimitInfo(
+                 requested=li.get("requested", limit),
+                 returned=li.get("returned", len(works)),
+                 total_available=li.get("total_available", data.get("total", 0)),
+                 capped=li.get("capped", False),
+                 capped_reason=li.get("capped_reason"),
+                 stage=li.get("stage", "crossref-local-remote"),
+             )
+
          return SearchResult(
              works=works,
              total=data.get("total", len(works)),
              query=query or title or doi or "",
              elapsed_ms=data.get("elapsed_ms", 0.0),
+             limit_info=limit_info,
          )

      def get(self, doi: str) -> Optional[Work]:
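
Note: a sketch of the remote path with the new with_if flag; "client" is a hypothetical RemoteClient instance (its constructor and base URL are outside this diff), while the flag and fields come from the hunk above:

    results = client.search("machine learning", limit=10, with_if=True)
    for work in results.works:
        print(work.title, work.impact_factor, work.impact_factor_source)
    if results.limit_info is not None and results.limit_info.capped:
        print(results.limit_info.capped_reason)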
@@ -20,6 +20,19 @@ class WorkResponse(BaseModel):
      page: Optional[str] = None
      abstract: Optional[str] = None
      citation_count: Optional[int] = None
+     impact_factor: Optional[float] = None
+     impact_factor_source: Optional[str] = None
+
+
+ class LimitInfoResponse(BaseModel):
+     """Information about result limiting."""
+
+     requested: int
+     returned: int
+     total_available: int
+     capped: bool = False
+     capped_reason: Optional[str] = None
+     stage: str = "crossref-local"


  class SearchResponse(BaseModel):
@@ -30,6 +43,7 @@ class SearchResponse(BaseModel):
      returned: int
      elapsed_ms: float
      results: List[WorkResponse]
+     limit_info: Optional[LimitInfoResponse] = None


  class InfoResponse(BaseModel):
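
Note: a sketch of the limit_info payload the server can now attach to a search response, using only the LimitInfoResponse fields defined above (the serialization call assumes Pydantic v2; on v1 it would be .dict()):

    info = LimitInfoResponse(
        requested=10,
        returned=10,
        total_available=1234,
        capped=True,
        capped_reason="crossref-local: Limited to 10 results (total available: 1234)",
    )
    print(info.model_dump())  # stage defaults to "crossref-local"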