ietf-notebook 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,19 @@
1
+ Copyright (c) Mark Nottingham
2
+
3
+ Permission is hereby granted, free of charge, to any person obtaining a copy
4
+ of this software and associated documentation files (the "Software"), to deal
5
+ in the Software without restriction, including without limitation the rights
6
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
7
+ copies of the Software, and to permit persons to whom the Software is
8
+ furnished to do so, subject to the following conditions:
9
+
10
+ The above copyright notice and this permission notice shall be included in
11
+ all copies or substantial portions of the Software.
12
+
13
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
18
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
19
+ THE SOFTWARE.
@@ -0,0 +1 @@
1
+ include ietf_notebook/py.typed
@@ -0,0 +1,85 @@
1
+ Metadata-Version: 2.4
2
+ Name: ietf-notebook
3
+ Version: 0.1.0
4
+ Summary: Gather resources for an IETF Working Group for use in NotebookLM
5
+ Author-email: Mark Nottingham <mnot@mnot.net>
6
+ License: Copyright (c) Mark Nottingham
7
+
8
+ Permission is hereby granted, free of charge, to any person obtaining a copy
9
+ of this software and associated documentation files (the "Software"), to deal
10
+ in the Software without restriction, including without limitation the rights
11
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
12
+ copies of the Software, and to permit persons to whom the Software is
13
+ furnished to do so, subject to the following conditions:
14
+
15
+ The above copyright notice and this permission notice shall be included in
16
+ all copies or substantial portions of the Software.
17
+
18
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
19
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
20
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
21
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
22
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
23
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
24
+ THE SOFTWARE.
25
+
26
+ Project-URL: homepage, https://github.com/mnot/ietf-notebook
27
+ Classifier: Operating System :: OS Independent
28
+ Classifier: Development Status :: 4 - Beta
29
+ Classifier: License :: OSI Approved :: MIT License
30
+ Requires-Python: >=3.9
31
+ Description-Content-Type: text/markdown
32
+ License-File: LICENSE.md
33
+ Requires-Dist: requests
34
+ Requires-Dist: beautifulsoup4
35
+ Provides-Extra: dev
36
+ Requires-Dist: mypy; extra == "dev"
37
+ Requires-Dist: black; extra == "dev"
38
+ Requires-Dist: pylint; extra == "dev"
39
+ Requires-Dist: pytest; extra == "dev"
40
+ Requires-Dist: pytest-md; extra == "dev"
41
+ Requires-Dist: validate-pyproject; extra == "dev"
42
+ Requires-Dist: build; extra == "dev"
43
+ Requires-Dist: types-requests; extra == "dev"
44
+ Requires-Dist: types-beautifulsoup4; extra == "dev"
45
+ Dynamic: license-file
46
+
47
+ # ietf-notebook
48
+
49
+ Automate gathering of [NotebookLM](https://notebooklm.google.com/)-ready documents for an [IETF](https://www.ietf.org/) Working Group.
50
+
51
+ This tool gathers Working Group charters, drafts, meeting minutes, PDF slides, mailing list archives, and GitHub issues into a set of clean text files and PDFs suitable for ingestion into NotebookLM.
52
+
53
+ ## Installation
54
+
55
+ ```bash
56
+ pipx install ietf-notebook
57
+ ```
58
+
59
+ ## Usage
60
+
61
+ ```bash
62
+ ietf-notebook wg_shortname [OPTIONS]
63
+ ```
64
+
65
+ ### Options
66
+
67
+ - `wg_shortname`: IETF Working Group short name (e.g., `httpbis`).
68
+ - `--destination`: Folder to save files in (default: current directory).
69
+ - `--github`: GitHub org/repo for issues (e.g., `ietf-wg-httpbis/wg-materials`).
70
+ - `--months`: Number of months of mailing list history to fetch (default: all).
71
+ - `--force`: Force re-downloading of existing files. By default, the tool skips files that already exist in the destination.
72
+ - `--quiet`: No messages except for errors and the final resource summary.
73
+ - `--verbose`: Detailed progress reporting.
74
+
75
+ ### Default Behavior
76
+
77
+ - **Charters, Meetings, and Mbox**: Existing files are skipped unless `--force` is used.
78
+ - **Mailing List Discovery**: The tool automatically finds the mailing list for the WG from the Datatracker.
79
+ - **IMAP Retrieval**: Mailing list archives are fetched via IMAP from `imap.ietf.org` and cached locally in `.imap-cache/`.
80
+ - **GitHub Strategy**: The tool first checks for `archive.json` on the `gh-pages` branch (common in repos using [Martin Thomson's template](https://github.com/martinthomson/internet-draft-template)).
81
+ - **GitHub Auth**: To avoid rate limits when fetching from the API, set the `GITHUB_TOKEN` environment variable.
82
+
83
+ ## Contributing
84
+
85
+ Pull requests are welcome. For major changes, please open an issue first to discuss what you would like to change.
@@ -0,0 +1,39 @@
1
+ # ietf-notebook
2
+
3
+ Automate gathering of [NotebookLM](https://notebooklm.google.com/)-ready documents for an [IETF](https://www.ietf.org/) Working Group.
4
+
5
+ This tool gathers Working Group charters, drafts, meeting minutes, PDF slides, mailing list archives, and GitHub issues into a set of clean text files and PDFs suitable for ingestion into NotebookLM.
6
+
7
+ ## Installation
8
+
9
+ ```bash
10
+ pipx install ietf-notebook
11
+ ```
12
+
13
+ ## Usage
14
+
15
+ ```bash
16
+ ietf-notebook wg_shortname [OPTIONS]
17
+ ```
18
+
19
+ ### Options
20
+
21
+ - `wg_shortname`: IETF Working Group short name (e.g., `httpbis`).
22
+ - `--destination`: Folder to save files in (default: current directory).
23
+ - `--github`: GitHub org/repo for issues (e.g., `ietf-wg-httpbis/wg-materials`).
24
+ - `--months`: Number of months of mailing list history to fetch (default: all).
25
+ - `--force`: Force re-downloading of existing files. By default, the tool skips files that already exist in the destination.
26
+ - `--quiet`: No messages except for errors and the final resource summary.
27
+ - `--verbose`: Detailed progress reporting.
28
+
29
+ ### Default Behavior
30
+
31
+ - **Charters, Meetings, and Mbox**: Existing files are skipped unless `--force` is used.
32
+ - **Mailing List Discovery**: The tool automatically finds the mailing list for the WG from the Datatracker.
33
+ - **IMAP Retrieval**: Mailing list archives are fetched via IMAP from `imap.ietf.org` and cached locally in `.imap-cache/`.
34
+ - **GitHub Strategy**: The tool first checks for `archive.json` on the `gh-pages` branch (common in repos using [Martin Thomson's template](https://github.com/martinthomson/internet-draft-template)).
35
+ - **GitHub Auth**: To avoid rate limits when fetching from the API, set the `GITHUB_TOKEN` environment variable.
36
+
37
+ ## Contributing
38
+
39
+ Pull requests are welcome. For major changes, please open an issue first to discuss what you would like to change.
@@ -0,0 +1 @@
1
+ __version__ = "0.1.0"
@@ -0,0 +1,120 @@
1
+ import argparse
2
+ import os
3
+ from .mbox import sync_mailing_list
4
+ from .github import download_github_issues, process_github_issues
5
+ from .meetings import process_meetings
6
+ from .charter import process_charter
7
+ from .drafts import process_drafts
8
+ from .utils import Verbosity, LogLevel, log
9
+
10
+
11
def main() -> None:
    """Command-line entry point: gather the resources for one Working Group.

    Parses the command line, ensures the destination folder exists, then runs
    the gathering steps in order: charter, meetings, mailing list, drafts and
    (only when ``--github`` is given) GitHub issues.  The paths returned by
    the steps are collected, de-duplicated, sorted and printed as a final
    summary relative to the current working directory.
    """
    parser = argparse.ArgumentParser(
        description="Automate creation of NotebookLM-ready documents for an IETF Working Group."
    )
    parser.add_argument("wg", help="IETF Working Group short name (e.g., 'httpbis')")
    parser.add_argument(
        "--github", help="GitHub owner/repo (e.g., 'ietf-wg-httpbis/wg-materials')"
    )
    parser.add_argument(
        "--months",
        type=int,
        default=None,
        help="Number of months of mailing list archives to fetch (default: all)",
    )
    parser.add_argument(
        "--destination",
        default=".",
        help="Destination folder (default: current directory)",
    )
    parser.add_argument(
        "--force",
        action="store_true",
        help="Overwrite resources that already exist in the destination",
    )
    parser.add_argument("--quiet", "-q", action="store_true", help="Only output errors")
    parser.add_argument(
        "--verbose", "-v", action="store_true", help="Detailed progress reporting"
    )

    args = parser.parse_args()

    # exist_ok=True replaces the former "check, then create" pair, which could
    # fail if the directory appeared between the two calls.
    os.makedirs(args.destination, exist_ok=True)

    # --quiet wins over --verbose when both are given.
    verbosity = Verbosity.STATUS
    if args.quiet:
        verbosity = Verbosity.QUIET
    elif args.verbose:
        verbosity = Verbosity.VERBOSE

    if verbosity != Verbosity.QUIET:
        print(f"Processing WG: {args.wg}")
        print(f"Destination: {args.destination}")
        if args.force:
            print("Force mode: overwriting existing files.")
        else:
            print("Default mode: skipping existing files (except GitHub issues).")
        print("-" * 40)

    results = []

    # 1. Charter
    charter_file = os.path.join(args.destination, f"{args.wg}-charter.txt")
    if not args.force and os.path.exists(charter_file):
        log(
            f"Skipping charter: {charter_file} already exists.",
            verbosity,
            level=LogLevel.PROGRESS,
        )
    else:
        results.extend(process_charter(args.wg, charter_file, verbose=verbosity))

    # 2. Meetings
    results.extend(
        process_meetings(args.wg, args.destination, force=args.force, verbose=verbosity)
    )

    # 3. Mailing List
    results.extend(
        sync_mailing_list(
            args.wg, args.destination, months=args.months, verbose=verbosity
        )
    )

    # 4. Drafts
    results.extend(
        process_drafts(args.wg, args.destination, force=args.force, verbose=verbosity)
    )

    # 5. GitHub Issues
    if args.github:
        gh_json = os.path.join(args.destination, f"{args.wg}-github-issues.json")
        gh_txt = os.path.join(args.destination, f"{args.wg}-github-issues.txt")

        if download_github_issues(args.github, gh_json, verbose=verbosity):
            results.extend(process_github_issues(gh_json, gh_txt, verbose=verbosity))

        # The JSON file is only a staging artefact for the text export.  A
        # missing file (e.g. the download never got as far as writing it) is
        # not an error worth reporting; any other failure still is.
        try:
            os.remove(gh_json)
        except FileNotFoundError:
            pass
        except OSError as err:
            log(f"Error cleaning up {gh_json}: {err}", verbosity, level=LogLevel.ERROR)
    else:
        log(
            "Skip GitHub issues: no GitHub repo provided.",
            verbosity,
            level=LogLevel.PROGRESS,
        )

    if verbosity != Verbosity.QUIET:
        print("-" * 40)
        print("All tasks completed.")

    # The resource summary is printed regardless of verbosity.
    if results:
        print("\n## Updated Resources")
        for res in sorted(set(results)):
            rel_path = os.path.relpath(res, os.getcwd())
            print(f"- {rel_path}")


if __name__ == "__main__":
    main()
@@ -0,0 +1,64 @@
1
+ from typing import List
2
+ from bs4 import BeautifulSoup
3
+ from .utils import Verbosity, LogLevel, clean_html, fetch_resource, log
4
+
5
+
6
def process_charter(
    wg_name: str, output_file: str, verbose: Verbosity = Verbosity.STATUS
) -> List[str]:
    """Download the charter of ``wg_name`` and save it as text.

    The Datatracker charter page is requested with a markdown ``Accept``
    header.  A markdown response is used verbatim; anything else is treated
    as HTML and reduced to the charter container before cleaning.

    Returns ``[output_file]`` when a charter was written, otherwise ``[]``.
    """
    url = f"https://datatracker.ietf.org/doc/charter-ietf-{wg_name}/"
    log(f"Fetching charter for {wg_name}...", verbose, level=LogLevel.STATUS)

    response = fetch_resource(url, headers={"Accept": "text/markdown"})
    if not response:
        log(f"Error: Could not fetch charter from {url}", verbose, level=LogLevel.ERROR)
        return []

    if "text/markdown" in response.headers.get("Content-Type", ""):
        charter_text = response.text
    else:
        page_html = response.text
        soup = BeautifulSoup(page_html, "html.parser")

        # Preferred containers, in order: 'card-body', then 'charter-text'.
        container = soup.find("div", class_="card-body") or soup.find(
            "div", class_="charter-text"
        )

        if not container:
            # Otherwise take the first <div> following an <h2> that mentions
            # "Charter".
            heading = next(
                (
                    h2
                    for h2 in soup.find_all("h2")
                    if h2.string and "Charter" in h2.string
                ),
                None,
            )
            if heading:
                container = heading.find_next("div")

        if container:
            charter_text = clean_html(str(container))
        else:
            # Nothing could be isolated: clean the full page, noise and all.
            log(
                "Warning: Could not isolate charter text, cleaning entire page.",
                verbose,
                level=LogLevel.PROGRESS,
            )
            charter_text = clean_html(page_html)

    if not charter_text:
        return []

    with open(output_file, "w", encoding="utf-8") as out_fh:
        out_fh.writelines(
            [
                f"Working Group Charter: {wg_name}\n",
                f"Source: {url}\n",
                "=" * 80 + "\n\n",
                charter_text + "\n",
            ]
        )

    log(f"Done! Charter written to {output_file}.", verbose, level=LogLevel.STATUS)
    return [output_file]
@@ -0,0 +1,94 @@
1
+ import os
2
+ import re
3
+ from typing import List, Dict, Any
4
+ from bs4 import BeautifulSoup
5
+ from .utils import LogLevel, Verbosity, log, fetch_resource
6
+
7
+
8
def get_adopted_drafts(
    wg_name: str, verbose: Verbosity = Verbosity.STATUS
) -> List[Dict[str, Any]]:
    """Return the WG's drafts found on its Datatracker documents page.

    Each entry is ``{"name": <draft name without revision>, "max_rev": <int>}``
    where ``max_rev`` is the highest revision number seen for that name.
    An empty list is returned when the page cannot be fetched.
    """
    url = f"https://datatracker.ietf.org/wg/{wg_name}/documents/"
    log(f"Finding adopted drafts for {wg_name}...", verbose, level=LogLevel.STATUS)
    page = fetch_resource(url)
    if not page:
        return []

    # A qualifying link points at /doc/draft-ietf-<wg>-... and its text is the
    # draft name followed by "-<revision>", e.g. "draft-ietf-aipref-vocab-05".
    href_prefix = f"/doc/draft-ietf-{wg_name}-"
    text_prefix = f"draft-ietf-{wg_name}-"
    name_and_rev = re.compile(r"(draft-ietf-" + re.escape(wg_name) + r"-.*?)-(\d+)$")

    # name -> highest revision; insertion order follows first appearance.
    highest: Dict[str, int] = {}
    for link in BeautifulSoup(page.text, "html.parser").find_all("a", href=True):
        target = link.get("href")
        if not (isinstance(target, str) and target.startswith(href_prefix)):
            continue

        label = link.get_text(strip=True)
        if not label.startswith(text_prefix):
            continue

        found = name_and_rev.search(label)
        if found is None:
            continue
        try:
            revision = int(found.group(2))
        except ValueError:
            continue

        draft_name = found.group(1)
        if draft_name not in highest or revision > highest[draft_name]:
            highest[draft_name] = revision

    return [
        {"name": draft_name, "max_rev": revision}
        for draft_name, revision in highest.items()
    ]
54
+
55
+
56
def process_drafts(
    wg_name: str,
    destination: str,
    force: bool = False,
    verbose: Verbosity = Verbosity.STATUS,
) -> List[str]:
    """Save every revision (00 up to the latest) of each WG draft as text.

    Files already present in ``destination`` are left alone unless ``force``
    is set.  Revisions that cannot be fetched are skipped.  Returns the paths
    of the files written during this call.
    """
    drafts = get_adopted_drafts(wg_name, verbose)
    if not drafts:
        log(f"No adopted drafts found for {wg_name}.", verbose, level=LogLevel.STATUS)
        return []

    written = []
    for entry in drafts:
        draft_name = entry["name"]
        latest_rev = entry["max_rev"]
        log(
            f"Processing draft: {draft_name} (revs 00 to {latest_rev:02d})",
            verbose,
            level=LogLevel.STATUS,
        )

        for rev_number in range(latest_rev + 1):
            versioned = f"{draft_name}-{rev_number:02d}"
            filename = f"{versioned}.txt"
            target = os.path.join(destination, filename)

            # Only download when forced or when the file is not there yet.
            if force or not os.path.exists(target):
                log(f"Downloading {filename}...", verbose, level=LogLevel.PROGRESS)
                fetched = fetch_resource(
                    f"https://www.ietf.org/archive/id/{versioned}.txt"
                )
                if fetched:
                    with open(target, "w", encoding="utf-8") as out_fh:
                        out_fh.write(str(fetched.text))
                    written.append(target)

    return written
@@ -0,0 +1,242 @@
1
+ import json
2
+ import os
3
+ from datetime import datetime
4
+ from typing import Optional, List, Dict, Any
5
+ import requests
6
+ from .utils import LogLevel, Verbosity, log
7
+
8
+
9
def format_date(iso_date: Optional[str]) -> str:
    """Convert an ISO 8601 timestamp into ``YYYY-MM-DD HH:MM:SS [TZ]``.

    Args:
        iso_date: ISO 8601 string; a trailing ``Z`` is accepted as UTC.

    Returns:
        ``"(Unknown Date)"`` for a missing/empty value, the formatted date
        when parsing succeeds, or the input (as a string) when it cannot be
        parsed.  The time-zone name is only appended for timezone-aware
        values, so naive timestamps no longer end in a stray space.
    """
    if not iso_date:
        return "(Unknown Date)"
    try:
        # datetime.fromisoformat() only understands the "Z" suffix from
        # Python 3.11 on, so spell it out as an explicit UTC offset.
        parsed = datetime.fromisoformat(iso_date.replace("Z", "+00:00"))
    except (ValueError, TypeError, AttributeError):
        # AttributeError covers non-string input, which has no .replace().
        return str(iso_date)
    # %Z expands to "" for naive datetimes; strip the dangling separator.
    return parsed.strftime("%Y-%m-%d %H:%M:%S %Z").rstrip()
18
+
19
+
20
def _label_names(raw_labels: Any) -> List[str]:
    """Return printable label names from a list of strings and/or label objects."""
    names = []
    for label in raw_labels or []:
        if isinstance(label, dict):
            label = label.get("name")
        if label:
            names.append(str(label))
    return names


def process_github_issues(
    input_file: str, output_file: str, verbose: Verbosity = Verbosity.STATUS
) -> List[str]:
    """Render a GitHub issues JSON archive as plain text.

    Args:
        input_file: Path of the JSON archive.  The expected shape is a dict
            with ``repo``, ``timestamp`` and ``issues`` keys; a bare list of
            issues is accepted as well.
        output_file: Path of the text file to write.
        verbose: Verbosity passed through to ``log``.

    Returns:
        ``[output_file]`` on success, ``[]`` when the archive cannot be read
        or does not contain JSON of a usable shape.
    """
    log(f"Opening {input_file}...", verbose, level=LogLevel.PROGRESS)
    try:
        with open(input_file, "r", encoding="utf-8") as json_fh:
            data = json.load(json_fh)
    except (json.JSONDecodeError, OSError) as err:
        log(f"Error parsing GitHub JSON: {err}", verbose, level=LogLevel.ERROR)
        return []

    # A directly downloaded archive is stored as-is and may be a bare list.
    if isinstance(data, list):
        data = {"issues": data}
    elif not isinstance(data, dict):
        log(
            f"Error parsing GitHub JSON: unexpected top-level {type(data).__name__}",
            verbose,
            level=LogLevel.ERROR,
        )
        return []

    issues = data.get("issues") or []
    repo_name = data.get("repo", "Unknown Repo")

    with open(output_file, "w", encoding="utf-8") as out_fh:
        out_fh.write(f"Repository: {repo_name}\n")
        out_fh.write(f"Archive Export Date: {format_date(data.get('timestamp'))}\n")
        out_fh.write("=" * 80 + "\n\n")

        for issue in issues:
            # "or" rather than a .get() default: fields may be present but
            # null, and should still fall back to the placeholder.
            title = issue.get("title") or "(No Title)"
            number = issue.get("number")
            if number is None:
                number = "?"
            state = issue.get("state") or "(Unknown State)"
            author = issue.get("author") or "(Unknown Author)"
            created_at = format_date(issue.get("createdAt"))
            labels = ", ".join(_label_names(issue.get("labels")))
            body = (issue.get("body") or "").strip()

            out_fh.write(f"Issue #{number}: {title}\n")
            out_fh.write(f"State: {state}\n")
            out_fh.write(f"Date: {created_at}\n")
            out_fh.write(f"Author: {author}\n")
            if labels:
                out_fh.write(f"Labels: {labels}\n")
            out_fh.write("\n")

            out_fh.write((body or "(No description provided)") + "\n")

            comments = issue.get("comments")
            # Raw GitHub API payloads carry a comment *count* here, not a list.
            if not isinstance(comments, list):
                comments = []
            if comments:
                out_fh.write("\n" + "-" * 40 + "\n")
                out_fh.write(f"Comments ({len(comments)}):\n\n")
                for comment in comments:
                    c_author = comment.get("author") or "(Unknown)"
                    c_date = format_date(comment.get("createdAt"))
                    c_body = (comment.get("body") or "").strip()

                    out_fh.write(f"--- Comment by {c_author} on {c_date} ---\n")
                    out_fh.write(c_body + "\n\n")

            out_fh.write("=" * 80 + "\n\n")

    log(
        f"Done! Extracted {len(issues)} issues to {output_file}.",
        verbose,
        level=LogLevel.STATUS,
    )
    return [output_file]
79
+
80
+
81
def _export_envelope(repo_full_name: str, issues: Any) -> Dict[str, Any]:
    """Wrap ``issues`` in the dict layout written to the JSON archive."""
    return {
        "repo": repo_full_name,
        "timestamp": datetime.now().isoformat(),
        "issues": issues,
    }


def download_github_issues(
    repo_short: str,
    dest_path: str,
    token: Optional[str] = None,
    verbose: Verbosity = Verbosity.STATUS,
) -> bool:
    """Write a JSON archive of a repository's issues to ``dest_path``.

    Sources are tried in this order:

    1. If ``repo_short`` starts with ``http`` it is treated as a direct URL
       and its body is saved unchanged.
    2. ``https://<owner>.github.io/<repo>/archive.json``; a list, or a dict
       without an ``issues`` key, is wrapped into the archive layout.
    3. The GitHub REST API, authenticated with ``token`` or the
       ``GITHUB_TOKEN`` environment variable when either is available.

    Args:
        repo_short: ``owner/repo`` short name, or a full URL.
        dest_path: Path of the JSON file to write.
        token: Optional GitHub token; overrides ``GITHUB_TOKEN``.
        verbose: Verbosity passed through to ``log``.

    Returns:
        ``True`` when a file was written, ``False`` otherwise.
    """
    if repo_short.startswith("http"):
        log(
            f"Direct downloading GitHub issues from {repo_short}...",
            verbose,
            level=LogLevel.STATUS,
        )
        try:
            response = requests.get(repo_short, timeout=60)
            response.raise_for_status()
            with open(dest_path, "w", encoding="utf-8") as json_file:
                json_file.write(response.text)
            return True
        except (requests.RequestException, OSError) as err:
            log(
                f"Error downloading GitHub issues: {err}", verbose, level=LogLevel.ERROR
            )
            return False

    # Expecting owner/repo
    if "/" not in repo_short:
        log(
            f"Invalid GitHub short name: {repo_short}. Expected 'owner/repo'.",
            verbose,
            level=LogLevel.ERROR,
        )
        return False
    owner, repo = repo_short.split("/", 1)
    full_name = f"{owner}/{repo}"
    archive_url = f"https://{owner}.github.io/{repo}/archive.json"

    log(
        f"Checking for GitHub archive at {archive_url}...",
        verbose,
        level=LogLevel.STATUS,
    )
    try:
        response = requests.get(archive_url, timeout=30)
        if response.status_code == 200:
            log("Archive found; downloading...", verbose, level=LogLevel.STATUS)
            try:
                archive_data = response.json()
                # Normalise to the expected layout (dict with an 'issues' key).
                if isinstance(archive_data, list):
                    archive_data = _export_envelope(full_name, archive_data)
                elif "issues" not in archive_data:
                    # A dict in some other layout: keep it as a single entry
                    # rather than discarding it.
                    archive_data = _export_envelope(full_name, [archive_data])
                with open(dest_path, "w", encoding="utf-8") as json_fh:
                    json.dump(archive_data, json_fh, indent=2)
                return True
            except (ValueError, TypeError) as err:
                # ValueError covers json.JSONDecodeError as well as the plain
                # ValueError that older requests releases raise.  The caller's
                # verbosity is honoured here like everywhere else.
                log(
                    f"Error parsing archive JSON: {err}",
                    verbose,
                    level=LogLevel.STATUS,
                )
        log("No archive found on gh-pages.", verbose, level=LogLevel.PROGRESS)
    except (requests.RequestException, OSError) as err:
        log(
            f"Error checking gh-pages archive: {err}",
            verbose,
            level=LogLevel.STATUS,
        )

    # No usable archive: fall back to the API.
    log(
        f"Fetching GitHub issues via API for {owner}/{repo}...",
        verbose,
        level=LogLevel.STATUS,
    )
    headers = {"Accept": "application/vnd.github.v3+json"}
    github_token = token or os.environ.get("GITHUB_TOKEN")
    if github_token:
        headers["Authorization"] = f"token {github_token}"

    try:
        all_issues = _fetch_all_issues(owner, repo, headers, verbose)
        with open(dest_path, "w", encoding="utf-8") as json_fh:
            json.dump(_export_envelope(full_name, all_issues), json_fh, indent=2)
        return True
    except (requests.RequestException, OSError) as err:
        log(f"Error fetching GitHub issues: {err}", verbose, level=LogLevel.ERROR)
        return False
180
+
181
+
182
def _fetch_all_issues(
    owner: str, repo_name: str, headers: Dict[str, str], verbose: Verbosity
) -> List[Dict[str, Any]]:
    """Fetch every issue (open and closed) of a repository, with comments.

    Pages through the GitHub issues endpoint 100 entries at a time, drops
    pull requests, and reduces each issue to the fields used by the text
    export.

    Args:
        owner: Repository owner.
        repo_name: Repository name.
        headers: HTTP headers sent with every request.
        verbose: Accepted for signature parity; not used by this function.

    Returns:
        One dict per issue with the keys ``number``, ``title``, ``state``,
        ``author``, ``createdAt``, ``labels``, ``body`` and ``comments``.

    Raises:
        requests.RequestException: if a page request fails or returns an
            error status.
    """
    all_issues: List[Dict[str, Any]] = []
    page = 1
    while True:
        api_url = (
            f"https://api.github.com/repos/{owner}/{repo_name}/issues"
            f"?state=all&page={page}&per_page=100"
        )
        res = requests.get(api_url, headers=headers, timeout=60)
        res.raise_for_status()
        issues = res.json()
        if not issues:
            break

        for issue in issues:
            # GitHub API returns both issues and PRs (PRs have a 'pull_request' key)
            if "pull_request" in issue:
                continue

            issue_data = {
                "number": issue.get("number"),
                "title": issue.get("title"),
                "state": issue.get("state"),
                # "user" can be present but null; .get("user", {}) would then
                # return None and the chained .get() would raise.
                "author": (issue.get("user") or {}).get("login"),
                "createdAt": issue.get("created_at"),
                # Labels may be objects or plain strings.
                "labels": [
                    label.get("name") if isinstance(label, dict) else label
                    for label in issue.get("labels") or []
                ],
                "body": issue.get("body"),
                "comments": [],
            }
            # "comments" on the API payload is a count; fetch only when > 0.
            if (issue.get("comments") or 0) > 0:
                issue_data["comments"] = _fetch_issue_comments(
                    issue.get("comments_url"), headers
                )

            all_issues.append(issue_data)

        # A short page means this was the last one.
        if len(issues) < 100:
            break
        page += 1
    return all_issues
225
+
226
+
227
def _fetch_issue_comments(
    comments_url: str, headers: Dict[str, str]
) -> List[Dict[str, Any]]:
    """Fetch all comments of one issue, following pagination.

    Args:
        comments_url: The issue's ``comments_url`` as returned by the API.
        headers: HTTP headers sent with every request.

    Returns:
        One dict per comment with the keys ``author``, ``createdAt`` and
        ``body``.  If a page does not come back with status 200, the comments
        collected so far are returned (an empty list when the first page
        fails).
    """
    collected: List[Dict[str, Any]] = []
    next_url: Optional[str] = comments_url
    # Ask for the largest page on the first request; the "next" links returned
    # by the server already carry the paging parameters.
    params: Optional[Dict[str, int]] = {"per_page": 100}

    while next_url:
        c_res = requests.get(next_url, headers=headers, params=params, timeout=30)
        if c_res.status_code != 200:
            break
        collected.extend(
            {
                # "user" can be null; guard before reading the login.
                "author": (comment.get("user") or {}).get("login"),
                "createdAt": comment.get("created_at"),
                "body": comment.get("body"),
            }
            for comment in c_res.json()
        )
        # Without following the Link header only the first page would be kept.
        next_url = c_res.links.get("next", {}).get("url")
        params = None
    return collected