scholarcli 1.15__tar.gz → 1.20__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {scholarcli-1.15/src/scholarcli.egg-info → scholarcli-1.20}/PKG-INFO +1 -1
- {scholarcli-1.15 → scholarcli-1.20}/pyproject.toml +1 -1
- {scholarcli-1.15 → scholarcli-1.20}/src/scholar/cli.py +37 -4
- {scholarcli-1.15 → scholarcli-1.20}/src/scholar/providers.py +253 -0
- {scholarcli-1.15 → scholarcli-1.20}/src/scholar/tui.py +13 -1
- {scholarcli-1.15 → scholarcli-1.20/src/scholarcli.egg-info}/PKG-INFO +1 -1
- {scholarcli-1.15 → scholarcli-1.20}/src/scholarcli.egg-info/SOURCES.txt +3 -0
- {scholarcli-1.15 → scholarcli-1.20}/src/snowball/apis/aggregator.py +1 -1
- {scholarcli-1.15 → scholarcli-1.20}/src/snowball/apis/google_scholar.py +6 -3
- {scholarcli-1.15 → scholarcli-1.20}/src/snowball/cli.py +100 -73
- {scholarcli-1.15 → scholarcli-1.20}/src/snowball/parsers/pdf_parser.py +223 -5
- scholarcli-1.20/src/snowball/services.py +337 -0
- {scholarcli-1.15 → scholarcli-1.20}/src/snowball/snowballing.py +32 -5
- {scholarcli-1.15 → scholarcli-1.20}/src/snowball/storage/json_storage.py +9 -2
- {scholarcli-1.15 → scholarcli-1.20}/src/snowball/tui/app.py +122 -284
- scholarcli-1.20/src/snowball/tui/dialogs.py +212 -0
- scholarcli-1.20/src/snowball/tui/setup.py +325 -0
- {scholarcli-1.15 → scholarcli-1.20}/tests/test_cli.py +70 -0
- {scholarcli-1.15 → scholarcli-1.20}/tests/test_providers.py +302 -0
- scholarcli-1.20/tests/test_tui.py +31 -0
- scholarcli-1.15/tests/test_tui.py +0 -11
- {scholarcli-1.15 → scholarcli-1.20}/LICENSE +0 -0
- {scholarcli-1.15 → scholarcli-1.20}/README.md +0 -0
- {scholarcli-1.15 → scholarcli-1.20}/setup.cfg +0 -0
- {scholarcli-1.15 → scholarcli-1.20}/src/scholar/__init__.py +0 -0
- {scholarcli-1.15 → scholarcli-1.20}/src/scholar/__main__.py +0 -0
- {scholarcli-1.15 → scholarcli-1.20}/src/scholar/cache.py +0 -0
- {scholarcli-1.15 → scholarcli-1.20}/src/scholar/enrich.py +0 -0
- {scholarcli-1.15 → scholarcli-1.20}/src/scholar/llm_review.py +0 -0
- {scholarcli-1.15 → scholarcli-1.20}/src/scholar/notes.py +0 -0
- {scholarcli-1.15 → scholarcli-1.20}/src/scholar/pdf.py +0 -0
- {scholarcli-1.15 → scholarcli-1.20}/src/scholar/questionary.py +0 -0
- {scholarcli-1.15 → scholarcli-1.20}/src/scholar/review.py +0 -0
- {scholarcli-1.15 → scholarcli-1.20}/src/scholar/scholar.py +0 -0
- {scholarcli-1.15 → scholarcli-1.20}/src/scholar/utils.py +0 -0
- {scholarcli-1.15 → scholarcli-1.20}/src/scholarcli.egg-info/dependency_links.txt +0 -0
- {scholarcli-1.15 → scholarcli-1.20}/src/scholarcli.egg-info/entry_points.txt +0 -0
- {scholarcli-1.15 → scholarcli-1.20}/src/scholarcli.egg-info/requires.txt +0 -0
- {scholarcli-1.15 → scholarcli-1.20}/src/scholarcli.egg-info/top_level.txt +0 -0
- {scholarcli-1.15 → scholarcli-1.20}/src/snowball/__init__.py +0 -0
- {scholarcli-1.15 → scholarcli-1.20}/src/snowball/apis/__init__.py +0 -0
- {scholarcli-1.15 → scholarcli-1.20}/src/snowball/apis/arxiv.py +0 -0
- {scholarcli-1.15 → scholarcli-1.20}/src/snowball/apis/base.py +0 -0
- {scholarcli-1.15 → scholarcli-1.20}/src/snowball/apis/crossref.py +0 -0
- {scholarcli-1.15 → scholarcli-1.20}/src/snowball/apis/openalex.py +0 -0
- {scholarcli-1.15 → scholarcli-1.20}/src/snowball/apis/opencitations.py +0 -0
- {scholarcli-1.15 → scholarcli-1.20}/src/snowball/apis/semantic_scholar.py +0 -0
- {scholarcli-1.15 → scholarcli-1.20}/src/snowball/exporters/__init__.py +0 -0
- {scholarcli-1.15 → scholarcli-1.20}/src/snowball/exporters/bibtex.py +0 -0
- {scholarcli-1.15 → scholarcli-1.20}/src/snowball/exporters/csv_exporter.py +0 -0
- {scholarcli-1.15 → scholarcli-1.20}/src/snowball/exporters/tikz.py +0 -0
- {scholarcli-1.15 → scholarcli-1.20}/src/snowball/filters/__init__.py +0 -0
- {scholarcli-1.15 → scholarcli-1.20}/src/snowball/filters/filter_engine.py +0 -0
- {scholarcli-1.15 → scholarcli-1.20}/src/snowball/models.py +0 -0
- {scholarcli-1.15 → scholarcli-1.20}/src/snowball/paper_utils.py +0 -0
- {scholarcli-1.15 → scholarcli-1.20}/src/snowball/parsers/__init__.py +0 -0
- {scholarcli-1.15 → scholarcli-1.20}/src/snowball/scoring/__init__.py +0 -0
- {scholarcli-1.15 → scholarcli-1.20}/src/snowball/scoring/base.py +0 -0
- {scholarcli-1.15 → scholarcli-1.20}/src/snowball/scoring/llm_scorer.py +0 -0
- {scholarcli-1.15 → scholarcli-1.20}/src/snowball/scoring/tfidf_scorer.py +0 -0
- {scholarcli-1.15 → scholarcli-1.20}/src/snowball/storage/__init__.py +0 -0
- {scholarcli-1.15 → scholarcli-1.20}/src/snowball/tui/__init__.py +0 -0
- {scholarcli-1.15 → scholarcli-1.20}/src/snowball/visualization.py +0 -0
- {scholarcli-1.15 → scholarcli-1.20}/src/tuxedo/__init__.py +0 -0
- {scholarcli-1.15 → scholarcli-1.20}/src/tuxedo/analysis.py +0 -0
- {scholarcli-1.15 → scholarcli-1.20}/src/tuxedo/cli.py +0 -0
- {scholarcli-1.15 → scholarcli-1.20}/src/tuxedo/clustering.py +0 -0
- {scholarcli-1.15 → scholarcli-1.20}/src/tuxedo/database.py +0 -0
- {scholarcli-1.15 → scholarcli-1.20}/src/tuxedo/grobid.py +0 -0
- {scholarcli-1.15 → scholarcli-1.20}/src/tuxedo/logging.py +0 -0
- {scholarcli-1.15 → scholarcli-1.20}/src/tuxedo/models.py +0 -0
- {scholarcli-1.15 → scholarcli-1.20}/src/tuxedo/project.py +0 -0
- {scholarcli-1.15 → scholarcli-1.20}/src/tuxedo/tui.py +0 -0
- {scholarcli-1.15 → scholarcli-1.20}/tests/test_cache.py +0 -0
- {scholarcli-1.15 → scholarcli-1.20}/tests/test_enrich.py +0 -0
- {scholarcli-1.15 → scholarcli-1.20}/tests/test_llm_review.py +0 -0
- {scholarcli-1.15 → scholarcli-1.20}/tests/test_notes.py +0 -0
- {scholarcli-1.15 → scholarcli-1.20}/tests/test_pdf.py +0 -0
- {scholarcli-1.15 → scholarcli-1.20}/tests/test_review.py +0 -0
- {scholarcli-1.15 → scholarcli-1.20}/tests/test_scholar.py +0 -0
- {scholarcli-1.15 → scholarcli-1.20}/tests/test_utils.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
[project]
|
|
2
2
|
name = "scholarcli"
|
|
3
|
-
version = "1.15"
|
|
3
|
+
version = "1.20"
|
|
4
4
|
description = "A tool for structured literature searches across bibliographic databases"
|
|
5
5
|
authors = [{ name = "Daniel Bosk", email = "dbosk@kth.se" },
|
|
6
6
|
{ name = "Ric Glassey", email = "glassey@kth.se" }]
|
|
@@ -1083,6 +1083,11 @@ def providers() -> None:
|
|
|
1083
1083
|
"env_var": "IEEE_API_KEY",
|
|
1084
1084
|
"how_to_get": "developer.ieee.org",
|
|
1085
1085
|
},
|
|
1086
|
+
"scopus": {
|
|
1087
|
+
"required": True,
|
|
1088
|
+
"env_var": "SCOPUS_API_KEY",
|
|
1089
|
+
"how_to_get": "dev.elsevier.com",
|
|
1090
|
+
},
|
|
1086
1091
|
}
|
|
1087
1092
|
|
|
1088
1093
|
for provider in get_all_providers():
|
|
@@ -1181,6 +1186,14 @@ def syntax() -> None:
|
|
|
1181
1186
|
"[green]✓[/]",
|
|
1182
1187
|
"Must be UPPERCASE, supports ONEAR",
|
|
1183
1188
|
)
|
|
1189
|
+
table.add_row(
|
|
1190
|
+
"scopus",
|
|
1191
|
+
"[green]✓[/]",
|
|
1192
|
+
"[green]✓[/]",
|
|
1193
|
+
"[green]✓[/]",
|
|
1194
|
+
"[green]✓[/]",
|
|
1195
|
+
"Must be UPPERCASE, supports W/n PRE/n proximity",
|
|
1196
|
+
)
|
|
1184
1197
|
table.add_row(
|
|
1185
1198
|
"arxiv",
|
|
1186
1199
|
"[green]✓[/]",
|
|
@@ -1236,6 +1249,13 @@ def syntax() -> None:
|
|
|
1236
1249
|
"[green]✓[/]",
|
|
1237
1250
|
"Max 5 wildcards per search",
|
|
1238
1251
|
)
|
|
1252
|
+
table2.add_row(
|
|
1253
|
+
"scopus",
|
|
1254
|
+
"[green]✓[/] * ?",
|
|
1255
|
+
'[green]✓[/] "..."',
|
|
1256
|
+
"[green]✓[/] TITLE() AUTH()",
|
|
1257
|
+
"TITLE-ABS-KEY(), AUTH(), SRCTITLE()",
|
|
1258
|
+
)
|
|
1239
1259
|
table2.add_row(
|
|
1240
1260
|
"arxiv",
|
|
1241
1261
|
"[red]✗[/]",
|
|
@@ -1255,6 +1275,7 @@ def syntax() -> None:
|
|
|
1255
1275
|
("dblp", "machine learning privacy [space = AND]"),
|
|
1256
1276
|
("wos", 'TS=("machine learning" AND privacy)'),
|
|
1257
1277
|
("ieee", '"machine learning" AND privacy NOT survey'),
|
|
1278
|
+
("scopus", 'TITLE-ABS-KEY("machine learning" AND privacy)'),
|
|
1258
1279
|
("arxiv", 'ti:"machine learning" AND cat:cs.AI'),
|
|
1259
1280
|
]
|
|
1260
1281
|
for provider, example in examples:
|
|
@@ -1284,6 +1305,7 @@ def syntax() -> None:
|
|
|
1284
1305
|
"ieee",
|
|
1285
1306
|
"https://ieeexplore.ieee.org/Xplorehelp/searching-ieee-xplore/command-search",
|
|
1286
1307
|
),
|
|
1308
|
+
("scopus", "https://dev.elsevier.com/sc_search_tips.html"),
|
|
1287
1309
|
(
|
|
1288
1310
|
"arxiv",
|
|
1289
1311
|
"https://info.arxiv.org/help/api/user-manual.html#query_details",
|
|
@@ -2038,7 +2060,7 @@ def sessions_show(
|
|
|
2038
2060
|
con.print()
|
|
2039
2061
|
|
|
2040
2062
|
# Show kept papers
|
|
2041
|
-
kept = session.kept_papers
|
|
2063
|
+
kept = session.kept_papers()
|
|
2042
2064
|
if kept:
|
|
2043
2065
|
con.print(f"[green bold]Kept ({len(kept)}):[/green bold]")
|
|
2044
2066
|
for d in kept:
|
|
@@ -2055,7 +2077,7 @@ def sessions_show(
|
|
|
2055
2077
|
con.print()
|
|
2056
2078
|
|
|
2057
2079
|
# Show discarded papers
|
|
2058
|
-
discarded = session.discarded_papers
|
|
2080
|
+
discarded = session.discarded_papers()
|
|
2059
2081
|
if discarded:
|
|
2060
2082
|
con.print(
|
|
2061
2083
|
f"[red bold]Discarded ({len(discarded)}):[/red bold]"
|
|
@@ -2074,7 +2096,7 @@ def sessions_show(
|
|
|
2074
2096
|
con.print()
|
|
2075
2097
|
|
|
2076
2098
|
# Show pending papers
|
|
2077
|
-
pending = session.pending_papers
|
|
2099
|
+
pending = session.pending_papers()
|
|
2078
2100
|
if pending:
|
|
2079
2101
|
con.print(
|
|
2080
2102
|
f"[yellow bold]Pending ({len(pending)}):[/yellow bold]"
|
|
@@ -2543,15 +2565,25 @@ def llm_classify(
|
|
|
2543
2565
|
help="Skip automatic enrichment of papers without abstracts.",
|
|
2544
2566
|
),
|
|
2545
2567
|
] = False,
|
|
2568
|
+
no_examples: Annotated[
|
|
2569
|
+
bool,
|
|
2570
|
+
typer.Option(
|
|
2571
|
+
"--no-examples",
|
|
2572
|
+
help="Run without requiring tagged examples (zero-shot).",
|
|
2573
|
+
),
|
|
2574
|
+
] = False,
|
|
2546
2575
|
) -> None:
|
|
2547
2576
|
"""
|
|
2548
2577
|
Classify pending papers using LLM.
|
|
2549
2578
|
|
|
2550
2579
|
Uses human-reviewed papers as training examples. Requires at least
|
|
2551
|
-
5 tagged examples (minimum 1 kept, 1 discarded)
|
|
2580
|
+
5 tagged examples (minimum 1 kept, 1 discarded) unless --no-examples
|
|
2581
|
+
is given, which runs zero-shot classification using only the research
|
|
2582
|
+
context.
|
|
2552
2583
|
|
|
2553
2584
|
Example:
|
|
2554
2585
|
scholar llm classify "my review" --count 20
|
|
2586
|
+
scholar llm classify "my review" --no-examples
|
|
2555
2587
|
"""
|
|
2556
2588
|
import scholar.review as review
|
|
2557
2589
|
from scholar.review import save_session
|
|
@@ -2582,6 +2614,7 @@ def llm_classify(
|
|
|
2582
2614
|
model_id=select_model_id(model_selection, "analytic"),
|
|
2583
2615
|
enrich_missing=not no_enrich,
|
|
2584
2616
|
dry_run=dry_run,
|
|
2617
|
+
require_examples=not no_examples,
|
|
2585
2618
|
)
|
|
2586
2619
|
|
|
2587
2620
|
if dry_run:
|
|
@@ -43,6 +43,7 @@ WOS_STARTER_API_URL = (
|
|
|
43
43
|
WOS_EXPANDED_API_URL = "https://wos-api.clarivate.com/api/wos"
|
|
44
44
|
_WOS_NOT_PROVIDED = object() # Sentinel for "argument not passed"
|
|
45
45
|
IEEE_API_URL = "https://ieeexploreapi.ieee.org/api/v1/search/articles"
|
|
46
|
+
SCOPUS_API_URL = "https://api.elsevier.com/content/search/scopus"
|
|
46
47
|
|
|
47
48
|
|
|
48
49
|
class SearchProvider(Protocol):
|
|
@@ -1683,6 +1684,8 @@ class WebOfScienceProvider:
|
|
|
1683
1684
|
"p",
|
|
1684
1685
|
default=None,
|
|
1685
1686
|
)
|
|
1687
|
+
if isinstance(abstract, list):
|
|
1688
|
+
abstract = "\n\n".join(str(p) for p in abstract if p) or None
|
|
1686
1689
|
|
|
1687
1690
|
# Extract venue (source title)
|
|
1688
1691
|
venue = None
|
|
@@ -2787,3 +2790,253 @@ class ArxivProvider:
|
|
|
2787
2790
|
|
|
2788
2791
|
# Register the provider on module import
|
|
2789
2792
|
register_provider(ArxivProvider())
|
|
2793
|
+
|
|
2794
|
+
|
|
2795
|
+
class ScopusProvider:
    """Search provider for Elsevier Scopus.

    Sends queries to ``SCOPUS_API_URL`` authenticated with an API key
    (and, optionally, an institutional token) and converts every result
    entry into a ``Paper``.
    """

    name = "scopus"
    MAX_LIMIT = 25  # Scopus returns max 25 per page

    # Generic publication-type names -> Scopus DOCTYPE() codes.
    _DOC_TYPE_CODES = {
        "article": "ar",
        "conference": "cp",
        "review": "re",
        "book": "bk",
    }

    def __init__(
        self,
        api_key: str | None = None,
        inst_token: str | None = None,
    ):
        """Initialize the Scopus provider.

        Args:
            api_key: API key for Scopus API. Falls back
                to SCOPUS_API_KEY environment variable.
            inst_token: Institutional token for extended
                access. Falls back to SCOPUS_INST_TOKEN.
        """
        self.api_key = api_key or os.environ.get("SCOPUS_API_KEY")
        self.inst_token = inst_token or os.environ.get("SCOPUS_INST_TOKEN")
        self._cache: dict = load_cache(self.name)
        register_cache(self.name, self._cache)

    def is_available(self) -> bool:
        """Scopus requires an API key."""
        return bool(self.api_key)

    # NOTE(review): the cache decorator wraps the whole method, so the empty
    # or partial lists returned on the missing-key and error paths presumably
    # end up cached as well — confirm that this is intended.
    @cachedmethod(
        lambda self: self._cache,
        key=lambda self, query, limit=100, filters=None: (
            query,
            limit,
            filters.cache_key() if filters else "",
        ),
    )
    def search(
        self,
        query: str,
        limit: int = 100,
        filters: SearchFilters | None = None,
    ) -> list[Paper]:
        """Search Scopus for papers matching the query.

        Fetches multiple pages when the requested limit
        exceeds the per-request maximum of 25 results.

        Args:
            query: Query string in Scopus search syntax.
            limit: Maximum number of papers to return.
            filters: Optional filters that are translated into extra
                query clauses (see ``_build_query``).

        Returns:
            The papers collected so far. Without an API key the result is
            an empty list; on an HTTP or other error the papers gathered
            before the failure are returned instead of raising.
        """
        if not self.api_key:
            return []

        logger.debug(
            "scopus: Searching for '%s' with limit=%d",
            query,
            limit,
        )

        search_query = self._build_query(query, filters)

        # The headers do not change between pages, so build them once.
        headers = {
            "X-ELS-APIKey": self.api_key,
            "Accept": "application/json",
        }
        if self.inst_token:
            headers["X-ELS-Insttoken"] = self.inst_token

        all_papers: list[Paper] = []
        start = 0

        try:
            while len(all_papers) < limit:
                # Never request more than is still needed or than one
                # page may contain.
                current_count = min(self.MAX_LIMIT, limit - len(all_papers))

                response = requests.get(
                    SCOPUS_API_URL,
                    headers=headers,
                    params={
                        "query": search_query,
                        "start": start,
                        "count": current_count,
                    },
                    timeout=30,
                )
                if response.status_code != 200:
                    self._log_http_error(response)
                    response.raise_for_status()
                data = response.json()

                results = data.get("search-results", {})
                entries = results.get("entry", [])

                # A lone entry flagged "@_fa": "false" is treated as
                # "no results", exactly like an empty entry list.
                if not entries or (
                    len(entries) == 1 and entries[0].get("@_fa") == "false"
                ):
                    break

                all_papers.extend(
                    self._convert_entry(entry)
                    for entry in entries
                    if entry.get("@_fa") != "false"
                )
                # Advance by everything the API returned, including
                # entries that were skipped above.
                start += len(entries)

                total = int(results.get("opensearch:totalResults", 0))
                if start >= total:
                    break

            logger.debug(
                "scopus: Retrieved %d papers",
                len(all_papers),
            )
            return all_papers
        except requests.exceptions.HTTPError:
            # Already reported by _log_http_error.
            return all_papers
        except Exception as e:
            logger.warning("scopus: %s", e)
            return all_papers

    def _build_query(
        self,
        query: str,
        filters: SearchFilters | None,
    ) -> str:
        """Combine the user query with clauses derived from the filters.

        Year, open-access, venue and publication-type filters become
        ``PUBYEAR``, ``OPENACCESS``, ``SRCTITLE`` and ``DOCTYPE`` clauses
        that are AND-ed onto the parenthesised query. Unsupported
        publication types and the citation-count filter are only logged.
        The query is returned unchanged when no clause applies.
        """
        if not filters:
            return query

        clauses = []

        if filters.year:
            start_year, end_year = filters.year_range()
            if start_year and end_year:
                if start_year == end_year:
                    clauses.append(f"PUBYEAR = {start_year}")
                else:
                    # Inclusive range expressed with strict comparisons.
                    clauses.append(
                        f"PUBYEAR > {start_year - 1} "
                        f"AND PUBYEAR < {end_year + 1}"
                    )
            elif start_year:
                clauses.append(f"PUBYEAR > {start_year - 1}")
            elif end_year:
                clauses.append(f"PUBYEAR < {end_year + 1}")

        if filters.open_access:
            clauses.append("OPENACCESS(1)")

        if filters.venue:
            clauses.append(f"SRCTITLE({filters.venue})")

        if filters.pub_types:
            doc_types = []
            for pt in filters.pub_types:
                mapped = self._DOC_TYPE_CODES.get(pt.lower())
                if mapped:
                    doc_types.append(mapped)
                else:
                    logger.warning(
                        "scopus: Publication type '%s' "
                        "not supported, ignoring",
                        pt,
                    )
            if doc_types:
                dtype_clause = " OR ".join(
                    f"DOCTYPE({dt})" for dt in doc_types
                )
                clauses.append(f"({dtype_clause})")

        if filters.min_citations is not None:
            logger.warning(
                "scopus: Citation count filter not supported, ignoring"
            )

        if not clauses:
            return query
        return f"({query}) AND " + " AND ".join(clauses)

    @staticmethod
    def _log_http_error(response) -> None:
        """Log a warning that explains a non-200 Scopus response."""
        status = response.status_code
        if status == 401:
            logger.warning(
                "scopus: Authentication failed (HTTP 401). "
                "Check your SCOPUS_API_KEY at "
                "https://dev.elsevier.com/"
            )
        elif status == 429:
            logger.warning(
                "scopus: Rate limited (HTTP 429). "
                "Wait before making more requests."
            )
        elif status == 403:
            logger.warning(
                "scopus: Access denied (HTTP 403). "
                "Your API key may lack Scopus Search "
                "permissions."
            )
        else:
            logger.warning(
                "scopus: API error (HTTP %d): %s",
                status,
                response.text[:200],
            )

    def _convert_entry(self, entry: dict) -> Paper:
        """Convert a Scopus search entry to a Paper.

        Missing or malformed fields are mapped to ``None`` (or an empty
        author list / empty title) instead of raising.
        """
        # Publication year is the first four characters of the cover date.
        year = None
        cover_date = entry.get("prism:coverDate")
        if cover_date:
            try:
                year = int(cover_date[:4])
            except (ValueError, TypeError):
                pass

        # Only the "dc:creator" value is read for the author list.
        authors = []
        creator = entry.get("dc:creator")
        if creator:
            authors.append(creator)

        # Tolerate a null "link" field, a single link object, or stray
        # non-dict items so that one odd entry cannot abort the page.
        links = entry.get("link") or []
        if isinstance(links, dict):
            links = [links]
        url = None
        for link in links:
            if isinstance(link, dict) and link.get("@ref") == "scopus":
                url = link.get("@href")
                break

        # Author keywords are split on "|".
        keywords = None
        auth_kw = entry.get("authkeywords")
        if auth_kw and isinstance(auth_kw, str):
            keywords = [kw.strip() for kw in auth_kw.split("|") if kw.strip()]

        citation_count = None
        cited_by = entry.get("citedby-count")
        if cited_by is not None:
            try:
                citation_count = int(cited_by)
            except (ValueError, TypeError):
                pass

        return Paper(
            title=entry.get("dc:title", "") or "",
            authors=authors,
            year=year,
            doi=entry.get("prism:doi"),
            abstract=entry.get("dc:description"),
            venue=entry.get("prism:publicationName"),
            url=url,
            citation_count=citation_count,
            keywords=keywords,
            sources=[self.name],
        )
|
|
3039
|
+
|
|
3040
|
+
|
|
3041
|
+
# Register a default Scopus provider on module import. The constructor
# falls back to SCOPUS_API_KEY / SCOPUS_INST_TOKEN from the environment.
register_provider(ScopusProvider())
|
|
@@ -153,6 +153,18 @@ class PaperListItem(ListItem):
|
|
|
153
153
|
)
|
|
154
154
|
|
|
155
155
|
|
|
156
|
+
def normalize_abstract(abstract: object) -> str:
    """Return abstract as a plain string, joining list paragraphs if needed.

    Some providers return the abstract as a list of paragraph strings.
    This helper joins them with blank lines so paragraph breaks are
    preserved in the TUI display.

    Args:
        abstract: The raw abstract value: a string, a list or tuple of
            paragraphs, ``None``, or any other value.

    Returns:
        Always a ``str``. Empty paragraphs are dropped, falsy input gives
        ``""``, and any other non-string value is converted with ``str()``
        so callers that expect text never receive another type.
    """
    if isinstance(abstract, (list, tuple)):
        return "\n\n".join(str(p) for p in abstract if p)
    if not abstract:
        return ""
    if isinstance(abstract, str):
        return abstract
    # Honour the declared return type for unexpected payloads.
    return str(abstract)
|
|
166
|
+
|
|
167
|
+
|
|
156
168
|
class AbstractScreen(Screen[None]):
|
|
157
169
|
"""Full-screen view of a paper's abstract and details."""
|
|
158
170
|
|
|
@@ -217,7 +229,7 @@ class AbstractScreen(Screen[None]):
|
|
|
217
229
|
yield Static("")
|
|
218
230
|
yield Static("[bold]Abstract:[/bold]")
|
|
219
231
|
if paper.abstract:
|
|
220
|
-
yield Static(escape(paper.abstract))
|
|
232
|
+
yield Static(escape(normalize_abstract(paper.abstract)))
|
|
221
233
|
else:
|
|
222
234
|
if paper.pdf_url:
|
|
223
235
|
yield Static(
|
|
@@ -25,6 +25,7 @@ src/snowball/__init__.py
|
|
|
25
25
|
src/snowball/cli.py
|
|
26
26
|
src/snowball/models.py
|
|
27
27
|
src/snowball/paper_utils.py
|
|
28
|
+
src/snowball/services.py
|
|
28
29
|
src/snowball/snowballing.py
|
|
29
30
|
src/snowball/visualization.py
|
|
30
31
|
src/snowball/apis/__init__.py
|
|
@@ -52,6 +53,8 @@ src/snowball/storage/__init__.py
|
|
|
52
53
|
src/snowball/storage/json_storage.py
|
|
53
54
|
src/snowball/tui/__init__.py
|
|
54
55
|
src/snowball/tui/app.py
|
|
56
|
+
src/snowball/tui/dialogs.py
|
|
57
|
+
src/snowball/tui/setup.py
|
|
55
58
|
src/tuxedo/__init__.py
|
|
56
59
|
src/tuxedo/analysis.py
|
|
57
60
|
src/tuxedo/cli.py
|
|
@@ -206,7 +206,7 @@ class APIAggregator:
|
|
|
206
206
|
if "google_scholar" in self.clients and paper.title:
|
|
207
207
|
try:
|
|
208
208
|
# Google Scholar returns dicts, convert to Paper objects
|
|
209
|
-
gs_limit = min(limit,
|
|
209
|
+
gs_limit = min(limit, 20) # Keep Scholar fallback batches deliberately small.
|
|
210
210
|
gs_citations = self.clients["google_scholar"].get_citations(
|
|
211
211
|
paper.title, gs_limit
|
|
212
212
|
)
|
|
@@ -6,6 +6,8 @@ from typing import Optional, Tuple, List
|
|
|
6
6
|
|
|
7
7
|
logger = logging.getLogger(__name__)
|
|
8
8
|
|
|
9
|
+
DEFAULT_RATE_LIMIT_DELAY = 15.0
|
|
10
|
+
|
|
9
11
|
|
|
10
12
|
class GoogleScholarClient:
|
|
11
13
|
"""Client for fetching citation counts from Google Scholar.
|
|
@@ -17,7 +19,7 @@ class GoogleScholarClient:
|
|
|
17
19
|
|
|
18
20
|
def __init__(
|
|
19
21
|
self,
|
|
20
|
-
rate_limit_delay: float =
|
|
22
|
+
rate_limit_delay: float = DEFAULT_RATE_LIMIT_DELAY,
|
|
21
23
|
proxy: Optional[str] = None,
|
|
22
24
|
use_free_proxy: bool = False,
|
|
23
25
|
):
|
|
@@ -25,7 +27,8 @@ class GoogleScholarClient:
|
|
|
25
27
|
|
|
26
28
|
Args:
|
|
27
29
|
rate_limit_delay: Delay between requests in seconds.
|
|
28
|
-
Default is
|
|
30
|
+
Default is intentionally conservative because
|
|
31
|
+
Google Scholar is scraped, not an official API.
|
|
29
32
|
proxy: HTTP/HTTPS proxy URL (e.g., "http://user:pass@host:port")
|
|
30
33
|
use_free_proxy: Use free rotating proxies via free-proxy library
|
|
31
34
|
"""
|
|
@@ -186,7 +189,7 @@ class GoogleScholarClient:
|
|
|
186
189
|
|
|
187
190
|
return similarity >= threshold
|
|
188
191
|
|
|
189
|
-
def get_citations(self, title: str, limit: int =
|
|
192
|
+
def get_citations(self, title: str, limit: int = 20) -> List[dict]:
|
|
190
193
|
"""Get papers that cite a given paper (forward citations).
|
|
191
194
|
|
|
192
195
|
Args:
|