scholarimpact 0.0.1.dev1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (37) hide show
  1. scholarimpact/__init__.py +80 -0
  2. scholarimpact/_version.py +21 -0
  3. scholarimpact/assets/README.md +36 -0
  4. scholarimpact/assets/__init__.py +136 -0
  5. scholarimpact/assets/fonts/OFL-SpaceGrotesk.txt +93 -0
  6. scholarimpact/assets/fonts/OFL-SpaceMono.txt +93 -0
  7. scholarimpact/assets/fonts/SpaceGrotesk-SemiBold.ttf +0 -0
  8. scholarimpact/assets/fonts/SpaceGrotesk-VariableFont_wght.ttf +0 -0
  9. scholarimpact/assets/fonts/SpaceMono-Bold.ttf +0 -0
  10. scholarimpact/assets/fonts/SpaceMono-BoldItalic.ttf +0 -0
  11. scholarimpact/assets/fonts/SpaceMono-Italic.ttf +0 -0
  12. scholarimpact/assets/fonts/SpaceMono-Regular.ttf +0 -0
  13. scholarimpact/assets/streamlit/config.toml +55 -0
  14. scholarimpact/cli/__init__.py +5 -0
  15. scholarimpact/cli/commands/__init__.py +5 -0
  16. scholarimpact/cli/commands/crawl.py +115 -0
  17. scholarimpact/cli/commands/dashboard.py +49 -0
  18. scholarimpact/cli/commands/extract.py +45 -0
  19. scholarimpact/cli/commands/generate.py +126 -0
  20. scholarimpact/cli/main.py +78 -0
  21. scholarimpact/core/__init__.py +6 -0
  22. scholarimpact/core/crawler.py +1455 -0
  23. scholarimpact/core/extractor.py +173 -0
  24. scholarimpact/core/utils.py +429 -0
  25. scholarimpact/dashboard/__init__.py +6 -0
  26. scholarimpact/dashboard/app.py +298 -0
  27. scholarimpact/dashboard/components/__init__.py +17 -0
  28. scholarimpact/dashboard/components/base.py +121 -0
  29. scholarimpact/dashboard/components/config.py +142 -0
  30. scholarimpact/dashboard/components/layout.py +178 -0
  31. scholarimpact/dashboard/components/streamlit_app.py +1280 -0
  32. scholarimpact-0.0.1.dev1.dist-info/METADATA +436 -0
  33. scholarimpact-0.0.1.dev1.dist-info/RECORD +37 -0
  34. scholarimpact-0.0.1.dev1.dist-info/WHEEL +5 -0
  35. scholarimpact-0.0.1.dev1.dist-info/entry_points.txt +2 -0
  36. scholarimpact-0.0.1.dev1.dist-info/licenses/LICENSE +21 -0
  37. scholarimpact-0.0.1.dev1.dist-info/top_level.txt +1 -0
@@ -0,0 +1,80 @@
1
+ """
2
+ ScholarImpact: Citation Analysis and Dashboard Package
3
+
4
+ A comprehensive tool for analyzing Google Scholar citations with geographic
5
+ and institutional insights, featuring interactive Streamlit dashboards.
6
+ """
7
+
8
+ from .core.crawler import CitationCrawler
9
+ from .core.extractor import AuthorExtractor
10
+ from .dashboard.app import Dashboard
11
+ from .data.loader import load_data
12
+
13
+ # Version handling with setuptools-scm
14
+ try:
15
+ from ._version import version as __version__
16
+ from ._version import version_tuple
17
+ except ImportError:
18
+ # Fallback for development or when _version.py doesn't exist
19
+ __version__ = "0.0.0+unknown"
20
+ version_tuple = (0, 0, 0, "unknown", "unknown")
21
+
22
+ __author__ = "Abhishek Tiwari"
23
+ __email__ = "schoscholarimpact@abhishek-tiwari.com"
24
+
25
+
26
+ # Main convenience functions
27
def extract_author(scholar_id, **kwargs):
    """Extract author publications from Google Scholar.

    Convenience wrapper: builds an ``AuthorExtractor`` with default settings
    and forwards ``scholar_id`` together with any keyword arguments to its
    ``extract`` method, returning whatever that call returns.
    """
    return AuthorExtractor().extract(scholar_id, **kwargs)
31
+
32
+
33
def crawl_citations(url_or_data, **kwargs):
    """Crawl citations from Google Scholar.

    Keyword arguments configure the ``CitationCrawler`` instance;
    ``url_or_data`` is passed unchanged to ``crawl_all_citations`` and that
    call's result is returned.
    """
    return CitationCrawler(**kwargs).crawl_all_citations(url_or_data)
37
+
38
+
39
def create_dashboard(data_dir, **kwargs):
    """Create a Streamlit dashboard for citation analysis.

    Returns a ``Dashboard`` built with ``data_dir`` and any extra keyword
    arguments; the dashboard is only constructed here, not started.
    """
    dashboard = Dashboard(data_dir=data_dir, **kwargs)
    return dashboard
42
+
43
+
44
+ # Quick start function
45
def quick_analysis(scholar_id, openalex_email=None, launch_dashboard=True, data_dir="./data"):
    """Complete analysis pipeline from Scholar ID to dashboard.

    Steps, in order: make sure ``data_dir`` exists, extract the author data
    into it, crawl citations starting from ``<data_dir>/author.json``, build
    the dashboard and, when ``launch_dashboard`` is true, run it.

    Returns a dict with the keys ``"author"``, ``"citations"`` and
    ``"dashboard"`` holding the result of each step.
    """
    import os

    # The extractor is asked to write into data_dir, so it must exist first.
    os.makedirs(data_dir, exist_ok=True)

    print(f"Extracting author data for {scholar_id}...")
    results = {"author": extract_author(scholar_id, output_dir=data_dir)}

    print("Crawling citations...")
    author_json = f"{data_dir}/author.json"
    results["citations"] = crawl_citations(author_json, openalex_email=openalex_email)

    print("Creating dashboard...")
    results["dashboard"] = create_dashboard(data_dir)

    if not launch_dashboard:
        return results

    print("Launching dashboard...")
    results["dashboard"].run()
    return results
69
+
70
+
71
+ __all__ = [
72
+ "CitationCrawler",
73
+ "AuthorExtractor",
74
+ "Dashboard",
75
+ "load_data",
76
+ "extract_author",
77
+ "crawl_citations",
78
+ "create_dashboard",
79
+ "quick_analysis",
80
+ ]
@@ -0,0 +1,21 @@
1
+ # file generated by setuptools-scm
2
+ # don't change, don't track in version control
3
+
4
+ __all__ = ["__version__", "__version_tuple__", "version", "version_tuple"]
5
+
6
+ TYPE_CHECKING = False
7
+ if TYPE_CHECKING:
8
+ from typing import Tuple
9
+ from typing import Union
10
+
11
+ VERSION_TUPLE = Tuple[Union[int, str], ...]
12
+ else:
13
+ VERSION_TUPLE = object
14
+
15
+ version: str
16
+ __version__: str
17
+ __version_tuple__: VERSION_TUPLE
18
+ version_tuple: VERSION_TUPLE
19
+
20
+ __version__ = version = '0.0.1.dev1'
21
+ __version_tuple__ = version_tuple = (0, 0, 1, 'dev1')
@@ -0,0 +1,36 @@
1
+ # ScholarImpact Assets
2
+
3
+ This directory contains bundled assets for the ScholarImpact package.
4
+
5
+ ## Files
6
+
7
+ - `streamlit/config.toml` - Default Streamlit configuration (theme, custom font faces, static file serving)
8
+ - `fonts/` - Font files (*.ttf, *.otf, *.woff) and their license texts, used for dashboard theming
9
+
10
+ ## Usage
11
+
12
+ Assets are automatically copied when using:
13
+
14
+ ```bash
15
+ scholarimpact generate-dashboard
16
+ ```
17
+
18
+ Or programmatically:
19
+
20
+ ```python
21
+ from scholarimpact.assets import copy_streamlit_config, copy_fonts
22
+
23
+ # Copy config
24
+ copy_streamlit_config('.streamlit/')
25
+
26
+ # Copy fonts and license files into static/ (created next to .streamlit/)
27
+ copy_fonts('.streamlit/')
28
+ ```
29
+
30
+ ## Adding Custom Assets
31
+
32
+ To add custom fonts or configurations:
33
+
34
+ 1. Place font files in the `fonts/` subdirectory of this directory
35
+ 2. Update config.toml as needed
36
+ 3. Reinstall package: `pip install -e .`
@@ -0,0 +1,136 @@
1
+ """
2
+ Assets module for ScholarImpact package.
3
+
4
+ Contains bundled configuration files, fonts, and other static assets.
5
+ """
6
+
7
+ import shutil
8
+ from pathlib import Path
9
+ from typing import Optional
10
+
11
+ from importlib.resources import files
12
+
13
+
14
def get_asset_path(asset_name: str) -> Optional[Path]:
    """
    Locate a bundled asset by name.

    The resources of the ``scholarimpact.assets`` package are consulted
    first. If that lookup raises, or the entry is not a file, the directory
    containing this module is checked instead.

    Args:
        asset_name: Name of asset file, relative to the assets package

    Returns:
        Path to asset or None if not found
    """
    resolved = None
    try:
        candidate = files("scholarimpact.assets").joinpath(asset_name)
        if candidate.is_file():
            resolved = Path(str(candidate))
    except Exception:
        # Best effort: any failure here simply means the local directory
        # next to this module is tried below.
        resolved = None
    if resolved is not None:
        return resolved

    local_candidate = Path(__file__).parent.joinpath(asset_name)
    return local_candidate if local_candidate.exists() else None
41
+
42
+
43
def copy_streamlit_config(output_dir: str, config_name: str = "config.toml") -> bool:
    """
    Copy bundled Streamlit config to output directory.

    The asset is looked up under ``config_name`` first. If nothing is found
    there, the ``streamlit/`` subdirectory of the assets package is tried as
    well, since the bundled ``config.toml`` is shipped in that subdirectory
    and the default ``config_name`` would otherwise never resolve.

    Args:
        output_dir: Output directory path (created if it does not exist)
        config_name: Config file name or asset-relative path
            (default: config.toml)

    Returns:
        True if successful, False if the asset could not be found or the
        directory creation / copy failed
    """
    config_path = get_asset_path(config_name) or get_asset_path(f"streamlit/{config_name}")
    if not config_path:
        return False

    try:
        output_path = Path(output_dir)
        output_path.mkdir(parents=True, exist_ok=True)
        # Keep only the file name, so an asset-relative path such as
        # "streamlit/config.toml" still lands at <output_dir>/config.toml.
        shutil.copy2(config_path, output_path / Path(config_name).name)
    except OSError:
        # Filesystem failures are reported through the return value.
        return False
    return True
68
+
69
+
70
def copy_fonts(output_dir: str) -> int:
    """
    Copy bundled fonts and license files to a ``static`` directory.

    Font files (``*.ttf``, ``*.otf``, ``*.woff*``) and license texts
    (``*.txt``) are read from the ``fonts`` directory next to this module.

    Args:
        output_dir: Output directory path. When it is a ``.streamlit``
            directory, ``static`` is created next to it (at the parent
            level); for any other directory, ``static`` is created inside
            ``output_dir`` itself.

    Returns:
        Number of files copied (fonts + licenses). Files that fail to copy
        are skipped and not counted.
    """
    fonts_dir = Path(__file__).parent / "fonts"
    if not fonts_dir.exists():
        return 0

    # Fonts first, then license files, matching the original copy order.
    patterns = ("*.ttf", "*.otf", "*.woff*", "*.txt")
    sources = [path for pattern in patterns for path in fonts_dir.glob(pattern)]
    if not sources:
        return 0

    output_path = Path(output_dir)
    if output_path.name == ".streamlit":
        # Create static at parent level (alongside .streamlit)
        static_dir = output_path.parent / "static"
    else:
        # Fallback: create static at output directory level
        static_dir = output_path / "static"
    static_dir.mkdir(parents=True, exist_ok=True)

    copied = 0
    for source in sources:
        try:
            shutil.copy2(source, static_dir / source.name)
        except OSError:
            # Best effort: one unreadable/unwritable file must not abort the rest.
            continue
        copied += 1
    return copied
126
+
127
+
128
def list_assets() -> list:
    """
    List all available bundled assets.

    Only regular files directly inside the directory of this module are
    reported; subdirectories are not descended into and ``__init__.py`` is
    left out.

    Returns:
        List of asset file names
    """
    names = []
    for entry in Path(__file__).parent.iterdir():
        if not entry.is_file() or entry.name == "__init__.py":
            continue
        names.append(entry.name)
    return names
@@ -0,0 +1,93 @@
1
+ Copyright 2020 The Space Grotesk Project Authors (https://github.com/floriankarsten/space-grotesk)
2
+
3
+ This Font Software is licensed under the SIL Open Font License, Version 1.1.
4
+ This license is copied below, and is also available with a FAQ at:
5
+ https://openfontlicense.org
6
+
7
+
8
+ -----------------------------------------------------------
9
+ SIL OPEN FONT LICENSE Version 1.1 - 26 February 2007
10
+ -----------------------------------------------------------
11
+
12
+ PREAMBLE
13
+ The goals of the Open Font License (OFL) are to stimulate worldwide
14
+ development of collaborative font projects, to support the font creation
15
+ efforts of academic and linguistic communities, and to provide a free and
16
+ open framework in which fonts may be shared and improved in partnership
17
+ with others.
18
+
19
+ The OFL allows the licensed fonts to be used, studied, modified and
20
+ redistributed freely as long as they are not sold by themselves. The
21
+ fonts, including any derivative works, can be bundled, embedded,
22
+ redistributed and/or sold with any software provided that any reserved
23
+ names are not used by derivative works. The fonts and derivatives,
24
+ however, cannot be released under any other type of license. The
25
+ requirement for fonts to remain under this license does not apply
26
+ to any document created using the fonts or their derivatives.
27
+
28
+ DEFINITIONS
29
+ "Font Software" refers to the set of files released by the Copyright
30
+ Holder(s) under this license and clearly marked as such. This may
31
+ include source files, build scripts and documentation.
32
+
33
+ "Reserved Font Name" refers to any names specified as such after the
34
+ copyright statement(s).
35
+
36
+ "Original Version" refers to the collection of Font Software components as
37
+ distributed by the Copyright Holder(s).
38
+
39
+ "Modified Version" refers to any derivative made by adding to, deleting,
40
+ or substituting -- in part or in whole -- any of the components of the
41
+ Original Version, by changing formats or by porting the Font Software to a
42
+ new environment.
43
+
44
+ "Author" refers to any designer, engineer, programmer, technical
45
+ writer or other person who contributed to the Font Software.
46
+
47
+ PERMISSION & CONDITIONS
48
+ Permission is hereby granted, free of charge, to any person obtaining
49
+ a copy of the Font Software, to use, study, copy, merge, embed, modify,
50
+ redistribute, and sell modified and unmodified copies of the Font
51
+ Software, subject to the following conditions:
52
+
53
+ 1) Neither the Font Software nor any of its individual components,
54
+ in Original or Modified Versions, may be sold by itself.
55
+
56
+ 2) Original or Modified Versions of the Font Software may be bundled,
57
+ redistributed and/or sold with any software, provided that each copy
58
+ contains the above copyright notice and this license. These can be
59
+ included either as stand-alone text files, human-readable headers or
60
+ in the appropriate machine-readable metadata fields within text or
61
+ binary files as long as those fields can be easily viewed by the user.
62
+
63
+ 3) No Modified Version of the Font Software may use the Reserved Font
64
+ Name(s) unless explicit written permission is granted by the corresponding
65
+ Copyright Holder. This restriction only applies to the primary font name as
66
+ presented to the users.
67
+
68
+ 4) The name(s) of the Copyright Holder(s) or the Author(s) of the Font
69
+ Software shall not be used to promote, endorse or advertise any
70
+ Modified Version, except to acknowledge the contribution(s) of the
71
+ Copyright Holder(s) and the Author(s) or with their explicit written
72
+ permission.
73
+
74
+ 5) The Font Software, modified or unmodified, in part or in whole,
75
+ must be distributed entirely under this license, and must not be
76
+ distributed under any other license. The requirement for fonts to
77
+ remain under this license does not apply to any document created
78
+ using the Font Software.
79
+
80
+ TERMINATION
81
+ This license becomes null and void if any of the above conditions are
82
+ not met.
83
+
84
+ DISCLAIMER
85
+ THE FONT SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
86
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO ANY WARRANTIES OF
87
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT
88
+ OF COPYRIGHT, PATENT, TRADEMARK, OR OTHER RIGHT. IN NO EVENT SHALL THE
89
+ COPYRIGHT HOLDER BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
90
+ INCLUDING ANY GENERAL, SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL
91
+ DAMAGES, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
92
+ FROM, OUT OF THE USE OR INABILITY TO USE THE FONT SOFTWARE OR FROM
93
+ OTHER DEALINGS IN THE FONT SOFTWARE.
@@ -0,0 +1,93 @@
1
+ Copyright 2016 The Space Mono Project Authors (https://github.com/googlefonts/spacemono)
2
+
3
+ This Font Software is licensed under the SIL Open Font License, Version 1.1.
4
+ This license is copied below, and is also available with a FAQ at:
5
+ https://openfontlicense.org
6
+
7
+
8
+ -----------------------------------------------------------
9
+ SIL OPEN FONT LICENSE Version 1.1 - 26 February 2007
10
+ -----------------------------------------------------------
11
+
12
+ PREAMBLE
13
+ The goals of the Open Font License (OFL) are to stimulate worldwide
14
+ development of collaborative font projects, to support the font creation
15
+ efforts of academic and linguistic communities, and to provide a free and
16
+ open framework in which fonts may be shared and improved in partnership
17
+ with others.
18
+
19
+ The OFL allows the licensed fonts to be used, studied, modified and
20
+ redistributed freely as long as they are not sold by themselves. The
21
+ fonts, including any derivative works, can be bundled, embedded,
22
+ redistributed and/or sold with any software provided that any reserved
23
+ names are not used by derivative works. The fonts and derivatives,
24
+ however, cannot be released under any other type of license. The
25
+ requirement for fonts to remain under this license does not apply
26
+ to any document created using the fonts or their derivatives.
27
+
28
+ DEFINITIONS
29
+ "Font Software" refers to the set of files released by the Copyright
30
+ Holder(s) under this license and clearly marked as such. This may
31
+ include source files, build scripts and documentation.
32
+
33
+ "Reserved Font Name" refers to any names specified as such after the
34
+ copyright statement(s).
35
+
36
+ "Original Version" refers to the collection of Font Software components as
37
+ distributed by the Copyright Holder(s).
38
+
39
+ "Modified Version" refers to any derivative made by adding to, deleting,
40
+ or substituting -- in part or in whole -- any of the components of the
41
+ Original Version, by changing formats or by porting the Font Software to a
42
+ new environment.
43
+
44
+ "Author" refers to any designer, engineer, programmer, technical
45
+ writer or other person who contributed to the Font Software.
46
+
47
+ PERMISSION & CONDITIONS
48
+ Permission is hereby granted, free of charge, to any person obtaining
49
+ a copy of the Font Software, to use, study, copy, merge, embed, modify,
50
+ redistribute, and sell modified and unmodified copies of the Font
51
+ Software, subject to the following conditions:
52
+
53
+ 1) Neither the Font Software nor any of its individual components,
54
+ in Original or Modified Versions, may be sold by itself.
55
+
56
+ 2) Original or Modified Versions of the Font Software may be bundled,
57
+ redistributed and/or sold with any software, provided that each copy
58
+ contains the above copyright notice and this license. These can be
59
+ included either as stand-alone text files, human-readable headers or
60
+ in the appropriate machine-readable metadata fields within text or
61
+ binary files as long as those fields can be easily viewed by the user.
62
+
63
+ 3) No Modified Version of the Font Software may use the Reserved Font
64
+ Name(s) unless explicit written permission is granted by the corresponding
65
+ Copyright Holder. This restriction only applies to the primary font name as
66
+ presented to the users.
67
+
68
+ 4) The name(s) of the Copyright Holder(s) or the Author(s) of the Font
69
+ Software shall not be used to promote, endorse or advertise any
70
+ Modified Version, except to acknowledge the contribution(s) of the
71
+ Copyright Holder(s) and the Author(s) or with their explicit written
72
+ permission.
73
+
74
+ 5) The Font Software, modified or unmodified, in part or in whole,
75
+ must be distributed entirely under this license, and must not be
76
+ distributed under any other license. The requirement for fonts to
77
+ remain under this license does not apply to any document created
78
+ using the Font Software.
79
+
80
+ TERMINATION
81
+ This license becomes null and void if any of the above conditions are
82
+ not met.
83
+
84
+ DISCLAIMER
85
+ THE FONT SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
86
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO ANY WARRANTIES OF
87
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT
88
+ OF COPYRIGHT, PATENT, TRADEMARK, OR OTHER RIGHT. IN NO EVENT SHALL THE
89
+ COPYRIGHT HOLDER BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
90
+ INCLUDING ANY GENERAL, SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL
91
+ DAMAGES, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
92
+ FROM, OUT OF THE USE OR INABILITY TO USE THE FONT SOFTWARE OR FROM
93
+ OTHER DEALINGS IN THE FONT SOFTWARE.
@@ -0,0 +1,55 @@
1
+ [server]
2
+ enableStaticServing = true
3
+
4
+ [[theme.fontFaces]]
5
+ family = "SpaceGrotesk"
6
+ url = "app/static/SpaceGrotesk-VariableFont_wght.ttf"
7
+
8
+ [[theme.fontFaces]]
9
+ family = "SpaceMono"
10
+ url = "app/static/SpaceMono-Bold.ttf"
11
+ style = "normal"
12
+ weight = 700
13
+
14
+ [[theme.fontFaces]]
15
+ family = "SpaceMono"
16
+ url = "app/static/SpaceMono-BoldItalic.ttf"
17
+ style = "italic"
18
+ weight = 700
19
+
20
+ [[theme.fontFaces]]
21
+ family = "SpaceMono"
22
+ url = "app/static/SpaceMono-Italic.ttf"
23
+ style = "italic"
24
+ weight = 400
25
+
26
+ [[theme.fontFaces]]
27
+ family = "SpaceMono"
28
+ url = "app/static/SpaceMono-Regular.ttf"
29
+ style = "normal"
30
+ weight = 400
31
+
32
+ [theme]
33
+ primaryColor = "#cb785c"
34
+ backgroundColor = "#fdfdf8"
35
+ secondaryBackgroundColor = "#ecebe3"
36
+ textColor = "#3d3a2a"
37
+ linkColor = "#3d3a2a"
38
+ borderColor = "#d3d2ca"
39
+ showWidgetBorder = true
40
+ baseRadius = "0.75rem"
41
+ buttonRadius = "full"
42
+ font = "SpaceGrotesk"
43
+ headingFontWeights = [600,500,500,500,500,500]
44
+ headingFontSizes = ["3rem", "2rem"]
45
+ codeFont = "SpaceMono"
46
+ codeFontSize = ".75rem"
47
+ codeBackgroundColor = "#ecebe4"
48
+ showSidebarBorder = true
49
+ chartCategoricalColors = ["#0ea5e9", "#059669", "#fbbf24"]
50
+
51
+ [theme.sidebar]
52
+ backgroundColor = "#f0f0ec"
53
+ secondaryBackgroundColor = "#ecebe3"
54
+ headingFontSizes = ["1.6rem", "1.4rem", "1.2rem"]
55
+ dataframeHeaderBackgroundColor = "#e4e4e0"
@@ -0,0 +1,5 @@
1
+ """CLI module for ScholarImpact."""
2
+
3
+ from .main import cli
4
+
5
+ __all__ = ["cli"]
@@ -0,0 +1,5 @@
1
+ """CLI commands for ScholarImpact."""
2
+
3
+ from . import crawl, dashboard, extract, generate
4
+
5
+ __all__ = ["extract", "crawl", "dashboard", "generate"]
@@ -0,0 +1,115 @@
1
+ """Crawl citations command for CLI."""
2
+
3
+ import json
4
+ from pathlib import Path
5
+
6
+ import click
7
+
8
+ from ...core.crawler import CitationCrawler
9
+
10
+
11
@click.command(name="crawl-citations")
@click.argument("author_json")
@click.option("--openalex-email", help="OpenAlex email for enhanced data")
@click.option("--max-citations", type=int, help="Maximum citations per paper")
@click.option(
    "--delay-min", default=5.0, type=float, help="Minimum delay between requests (default: 5.0)"
)
@click.option(
    "--delay-max", default=10.0, type=float, help="Maximum delay between requests (default: 10.0)"
)
@click.option(
    "--delay-between-articles-min",
    default=16.0,
    type=float,
    help="Minimum delay between articles (default: 16.0)",
)
@click.option(
    "--delay-between-articles-max",
    default=22.0,
    type=float,
    help="Maximum delay between articles (default: 22.0)",
)
@click.option("--output-dir", help="Output directory (defaults to author.json directory)")
def crawl_citations(
    author_json,
    openalex_email,
    max_citations,
    delay_min,
    delay_max,
    delay_between_articles_min,
    delay_between_articles_max,
    output_dir,
):
    """Crawl citations for publications in author.json file."""
    # The docstring above is shown by click as the command help text, so the
    # longer description lives in comments instead.
    #
    # Flow: load the author JSON, then for every article that has a
    # "cites_id" and no existing "cites-<id>.json" in the output directory,
    # crawl its citations and write them to that file. A failure on one
    # article is reported and counted, and the loop moves on to the next.
    #
    # NOTE(review): --max-citations and both --delay-between-articles-*
    # options are accepted but never used below. Wiring them up needs the
    # CitationCrawler API, which is not visible from this module - confirm
    # and either pass them through or drop the options.

    click.echo(f"Loading author data from: {author_json}")

    # Load author data; keep the original error attached as the cause.
    try:
        with open(author_json, "r", encoding="utf-8") as f:
            author_data = json.load(f)
    except FileNotFoundError as err:
        raise click.ClickException(f"Author file not found: {author_json}") from err
    except json.JSONDecodeError as err:
        raise click.ClickException(f"Invalid JSON in author file: {author_json}") from err

    # Default to the directory that holds the author file.
    output_path = Path(output_dir) if output_dir else Path(author_json).parent
    output_path.mkdir(parents=True, exist_ok=True)

    articles = author_data.get("articles", [])
    if not articles:
        click.echo("No articles found in author data")
        return

    click.echo(f"Found {len(articles)} articles to process")

    crawler = CitationCrawler(delay_range=(delay_min, delay_max), openalex_email=openalex_email)

    processed = 0
    skipped = 0
    errors = 0

    with click.progressbar(articles, label="Crawling citations") as article_bar:
        for article in article_bar:
            cites_id = article.get("cites_id")
            if not cites_id:
                skipped += 1
                continue

            # An existing output file marks the article as already crawled,
            # which lets an interrupted run be resumed.
            output_file = output_path / f"cites-{cites_id.replace(',', '_')}.json"
            if output_file.exists():
                skipped += 1
                continue

            try:
                citations = crawler.crawl_all_citations(
                    cites_id, max_pages=None  # max_citations is handled differently
                )

                # Nothing is written for an empty result, so such an article
                # is crawled again on the next run.
                if citations:
                    with open(output_file, "w", encoding="utf-8") as f:
                        json.dump(citations, f, ensure_ascii=False, indent=2)

                processed += 1

            except Exception as e:
                # Per-article boundary: report and continue with the rest.
                click.echo(f"\nError processing {article.get('title', 'Unknown')}: {e}")
                errors += 1

    # Summary
    click.echo("\n Citation crawling complete!")
    click.echo(f" Processed: {processed}")
    click.echo(f" Skipped (no ID or exists): {skipped}")
    if errors:
        click.echo(f" Errors: {errors}")