crossref-local 0.3.0__py3-none-any.whl → 0.4.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,296 @@
1
+ """Visualization tools for cache analysis.
2
+
3
+ Provides plotting and network visualization for cached paper collections.
4
+
5
+ Usage:
6
+ >>> from crossref_local import cache
7
+ >>> from crossref_local.cache_viz import plot_year_citations, plot_citation_network
8
+ >>> # Scatter plot: year vs citations
9
+ >>> plot_year_citations("epilepsy", output="epilepsy_scatter.png")
10
+ >>> # Citation network
11
+ >>> plot_citation_network("epilepsy", output="epilepsy_network.html")
12
+ """
13
+
14
+ from typing import Any, Dict, List, Optional, Tuple
15
+
16
+ from . import cache
17
+
18
+
19
def plot_year_citations(
    cache_name: str,
    output: Optional[str] = None,
    top_n: int = 10,
    highlight_threshold: Optional[int] = None,
    figsize: Tuple[int, int] = (10, 6),
) -> Dict[str, Any]:
    """Plot a year vs citation count scatter plot for a cache.

    Helps identify highly cited papers across publication years.

    Args:
        cache_name: Name of cache to analyze
        output: Output file path (png/pdf/svg). None for interactive display.
        top_n: Number of top-cited papers to label
        highlight_threshold: Citation threshold to highlight (draws a
            horizontal line). ``None`` draws no line; ``0`` is a valid
            threshold.
        figsize: Figure size in inches

    Returns:
        Dict with keys ``output`` (the saved path, or None when shown
        interactively), ``total_papers`` (number of plotted papers) and
        ``top_papers`` (list of dicts with doi/title/year/citations).
        If no paper has both a year and a citation count, returns
        ``{"error": ...}`` instead and nothing is plotted.

    Raises:
        ImportError: If matplotlib is not installed.

    Example:
        >>> plot_year_citations("epilepsy", output="scatter.png", top_n=5)
    """
    try:
        import matplotlib.pyplot as plt
    except ImportError as err:
        raise ImportError(
            "matplotlib required. Install with: pip install matplotlib"
        ) from err

    papers = cache.load(cache_name)

    # Keep only papers that have a (truthy) year and a non-None citation count.
    data = []
    for p in papers:
        year = p.get("year")
        citations = p.get("citation_count", 0)
        if not year or citations is None:
            continue
        data.append(
            {
                "doi": p.get("doi"),
                # `or ""` also covers a title stored as an explicit None.
                "title": (p.get("title") or "")[:50],
                "year": year,
                "citations": citations,
                "journal": p.get("journal", ""),
            }
        )

    if not data:
        return {"error": "No papers with year and citation data"}

    # Sort by citations (descending) for top-N
    data_sorted = sorted(data, key=lambda x: -x["citations"])
    top_papers = data_sorted[:top_n]

    # Create scatter plot
    fig, ax = plt.subplots(figsize=figsize)

    years = [d["year"] for d in data]
    citations = [d["citations"] for d in data]

    ax.scatter(years, citations, alpha=0.5, s=20)

    # Label the top papers; add an ellipsis only when the title was cut.
    for p in top_papers:
        title = p["title"]
        label = title if len(title) <= 30 else title[:30] + "..."
        ax.annotate(
            label,
            (p["year"], p["citations"]),
            fontsize=7,
            alpha=0.8,
            xytext=(5, 5),
            textcoords="offset points",
        )

    # Threshold line. Compare against None so a threshold of 0 is honoured.
    if highlight_threshold is not None:
        ax.axhline(y=highlight_threshold, color="r", linestyle="--", alpha=0.5)

    ax.set_xlabel("Publication Year")
    ax.set_ylabel("Citation Count")
    ax.set_title(f"Year vs Citations: {cache_name}")
    ax.grid(True, alpha=0.3)

    plt.tight_layout()

    if output:
        try:
            fig.savefig(output, dpi=150)
        finally:
            # Release the figure even if saving fails.
            plt.close(fig)
        result_path = output
    else:
        plt.show()
        result_path = None

    return {
        "output": result_path,
        "total_papers": len(data),
        "top_papers": [
            {
                "doi": p["doi"],
                "title": p["title"],
                "year": p["year"],
                "citations": p["citations"],
            }
            for p in top_papers
        ],
    }
124
+
125
+
126
def plot_citation_network(
    cache_name: str,
    output: Optional[str] = None,
    max_nodes: int = 100,
    include_external: bool = False,
) -> Dict[str, Any]:
    """Generate citation network visualization.

    Creates an interactive HTML network graph showing citation relationships
    between papers in the cache.

    Args:
        cache_name: Name of cache to analyze
        output: Output HTML file path. None returns network data.
        max_nodes: Maximum papers to include (sorted by citations)
        include_external: Include referenced papers that are not among the
            selected papers, drawn as small red nodes

    Returns:
        Dict with ``nodes``, ``edges``, ``cache_papers`` and
        ``selected_papers`` counts, plus either ``output`` (the saved path)
        or ``network_data`` (node ids/labels and the edge count).

    Raises:
        ImportError: If pyvis is not installed.

    Example:
        >>> plot_citation_network("epilepsy", output="network.html", max_nodes=50)
    """
    try:
        from pyvis.network import Network
    except ImportError as err:
        raise ImportError("pyvis required. Install with: pip install pyvis") from err

    papers = cache.load(cache_name)

    # Sort by citations (missing/None counts as 0) and take top N
    papers_sorted = sorted(papers, key=lambda x: -(x.get("citation_count") or 0))
    selected = papers_sorted[:max_nodes]
    selected_dois = {p["doi"] for p in selected if p.get("doi")}

    # Create network
    net = Network(height="750px", width="100%", bgcolor="#ffffff", font_color="black")
    net.barnes_hut()

    # Add one node per selected paper that has a DOI
    for p in selected:
        doi = p.get("doi")
        if not doi:
            continue

        # `or` fallbacks also cover values stored as an explicit None,
        # matching the None handling of the sort key above.
        full_title = p.get("title") or "No title"
        citations = p.get("citation_count") or 0
        year = p.get("year") or "?"

        # Size grows with the square root of the citation count, clamped to
        # [0, 500] so a negative count cannot produce a complex number.
        size = 10 + min(max(citations, 0), 500) ** 0.5 * 2

        net.add_node(
            doi,
            label=f"{full_title[:40]}...\n({year})",
            title=f"{full_title}\n{doi}\nCitations: {citations}",
            size=size,
            color="#3498db" if citations > 50 else "#95a5a6",
        )

    # Add edges from references. External node ids are tracked in a set so
    # the "already added?" check does not rescan every node per reference.
    external_ids = set()
    edge_count = 0
    for p in selected:
        doi = p.get("doi")
        refs = p.get("references", [])
        if not doi or not refs:
            continue

        for ref in refs:
            if not ref:
                # Skip empty/None references; they cannot be valid node ids.
                continue
            if ref not in selected_dois:
                if not include_external:
                    continue
                # External reference: add its node once
                if ref not in external_ids:
                    net.add_node(
                        ref,
                        label=ref[:20],
                        title=f"External: {ref}",
                        size=5,
                        color="#e74c3c",
                    )
                    external_ids.add(ref)
            net.add_edge(doi, ref)
            edge_count += 1

    result = {
        "nodes": len(net.nodes),
        "edges": edge_count,
        "cache_papers": len(papers),
        "selected_papers": len(selected),
    }

    if output:
        net.save_graph(output)
        result["output"] = output
    else:
        result["network_data"] = {
            "nodes": [{"id": n["id"], "label": n["label"]} for n in net.nodes],
            "edges": edge_count,
        }

    return result
232
+
233
+
234
def get_top_cited(
    cache_name: str,
    n: int = 20,
    year_min: Optional[int] = None,
    year_max: Optional[int] = None,
) -> List[Dict[str, Any]]:
    """Return the most-cited papers of a cache, highest count first.

    Args:
        cache_name: Name of cache
        n: Number of papers to return
        year_min: Filter by minimum year (passed through to ``cache.query``)
        year_max: Filter by maximum year (passed through to ``cache.query``)

    Returns:
        List of paper dicts sorted by descending citation count; papers
        whose count is missing or None are ranked as 0.
    """

    def _descending_citations(paper):
        # Negated so that an ascending sort puts the most cited first.
        return -(paper.get("citation_count") or 0)

    matches = cache.query(
        cache_name,
        include_citations=True,
        year_min=year_min,
        year_max=year_max,
    )

    ranked = list(matches)
    ranked.sort(key=_descending_citations)
    return ranked[:n]
261
+
262
+
263
def get_citation_summary(cache_name: str) -> Dict[str, Any]:
    """Summarize the citation counts of the papers in a cache.

    Args:
        cache_name: Name of cache

    Returns:
        Dict with counts, total, mean, median, stdev, max, min and the
        number of papers with at least 50 / 100 citations. Papers whose
        citation count is None or missing are left out of the statistics.
        Returns ``{"error": ...}`` when no paper has a citation count.
    """
    papers = cache.load(cache_name)

    # Collect every citation count that is actually present.
    counts = []
    for paper in papers:
        value = paper.get("citation_count")
        if value is not None:
            counts.append(value)

    if not counts:
        return {"error": "No citation data available"}

    import statistics

    summary = {}
    summary["total_papers"] = len(papers)
    summary["papers_with_citations"] = len(counts)
    summary["total_citations"] = sum(counts)
    summary["mean"] = round(statistics.mean(counts), 2)
    summary["median"] = statistics.median(counts)
    # A sample standard deviation needs at least two values.
    if len(counts) > 1:
        summary["stdev"] = round(statistics.stdev(counts), 2)
    else:
        summary["stdev"] = 0
    summary["max"] = max(counts)
    summary["min"] = min(counts)
    summary["highly_cited_50"] = len([c for c in counts if c >= 50])
    summary["highly_cited_100"] = len([c for c in counts if c >= 100])
    return summary
File without changes