ietf-notebook 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ietf_notebook-0.1.0/LICENSE.md +19 -0
- ietf_notebook-0.1.0/MANIFEST.in +1 -0
- ietf_notebook-0.1.0/PKG-INFO +85 -0
- ietf_notebook-0.1.0/README.md +39 -0
- ietf_notebook-0.1.0/ietf_notebook/__init__.py +1 -0
- ietf_notebook-0.1.0/ietf_notebook/__main__.py +120 -0
- ietf_notebook-0.1.0/ietf_notebook/charter.py +64 -0
- ietf_notebook-0.1.0/ietf_notebook/drafts.py +94 -0
- ietf_notebook-0.1.0/ietf_notebook/github.py +242 -0
- ietf_notebook-0.1.0/ietf_notebook/mbox.py +223 -0
- ietf_notebook-0.1.0/ietf_notebook/meetings.py +247 -0
- ietf_notebook-0.1.0/ietf_notebook/py.typed +0 -0
- ietf_notebook-0.1.0/ietf_notebook/utils.py +180 -0
- ietf_notebook-0.1.0/ietf_notebook.egg-info/PKG-INFO +85 -0
- ietf_notebook-0.1.0/ietf_notebook.egg-info/SOURCES.txt +19 -0
- ietf_notebook-0.1.0/ietf_notebook.egg-info/dependency_links.txt +1 -0
- ietf_notebook-0.1.0/ietf_notebook.egg-info/entry_points.txt +2 -0
- ietf_notebook-0.1.0/ietf_notebook.egg-info/requires.txt +13 -0
- ietf_notebook-0.1.0/ietf_notebook.egg-info/top_level.txt +2 -0
- ietf_notebook-0.1.0/pyproject.toml +101 -0
- ietf_notebook-0.1.0/setup.cfg +4 -0
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
Copyright (c) Mark Nottingham
|
|
2
|
+
|
|
3
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
4
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
5
|
+
in the Software without restriction, including without limitation the rights
|
|
6
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
7
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
8
|
+
furnished to do so, subject to the following conditions:
|
|
9
|
+
|
|
10
|
+
The above copyright notice and this permission notice shall be included in
|
|
11
|
+
all copies or substantial portions of the Software.
|
|
12
|
+
|
|
13
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
14
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
15
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
16
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
17
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
18
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
|
19
|
+
THE SOFTWARE.
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
include ietf_notebook/py.typed
|
|
@@ -0,0 +1,85 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: ietf-notebook
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Gather resources for an IETF Working Group for use in NotebookLM
|
|
5
|
+
Author-email: Mark Nottingham <mnot@mnot.net>
|
|
6
|
+
License: Copyright (c) Mark Nottingham
|
|
7
|
+
|
|
8
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
9
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
10
|
+
in the Software without restriction, including without limitation the rights
|
|
11
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
12
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
13
|
+
furnished to do so, subject to the following conditions:
|
|
14
|
+
|
|
15
|
+
The above copyright notice and this permission notice shall be included in
|
|
16
|
+
all copies or substantial portions of the Software.
|
|
17
|
+
|
|
18
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
19
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
20
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
21
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
22
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
23
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
|
24
|
+
THE SOFTWARE.
|
|
25
|
+
|
|
26
|
+
Project-URL: homepage, https://github.com/mnot/ietf-notebook
|
|
27
|
+
Classifier: Operating System :: OS Independent
|
|
28
|
+
Classifier: Development Status :: 4 - Beta
|
|
29
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
30
|
+
Requires-Python: >=3.9
|
|
31
|
+
Description-Content-Type: text/markdown
|
|
32
|
+
License-File: LICENSE.md
|
|
33
|
+
Requires-Dist: requests
|
|
34
|
+
Requires-Dist: beautifulsoup4
|
|
35
|
+
Provides-Extra: dev
|
|
36
|
+
Requires-Dist: mypy; extra == "dev"
|
|
37
|
+
Requires-Dist: black; extra == "dev"
|
|
38
|
+
Requires-Dist: pylint; extra == "dev"
|
|
39
|
+
Requires-Dist: pytest; extra == "dev"
|
|
40
|
+
Requires-Dist: pytest-md; extra == "dev"
|
|
41
|
+
Requires-Dist: validate-pyproject; extra == "dev"
|
|
42
|
+
Requires-Dist: build; extra == "dev"
|
|
43
|
+
Requires-Dist: types-requests; extra == "dev"
|
|
44
|
+
Requires-Dist: types-beautifulsoup4; extra == "dev"
|
|
45
|
+
Dynamic: license-file
|
|
46
|
+
|
|
47
|
+
# ietf-notebook
|
|
48
|
+
|
|
49
|
+
Automate gathering of [NotebookLM](https://notebooklm.google.com/)-ready documents for an [IETF](https://www.ietf.org/) Working Group.
|
|
50
|
+
|
|
51
|
+
This tool gathers Working Group charters, drafts, meeting minutes, PDF slides, mailing list archives, and GitHub issues into a set of clean text files and PDFs suitable for ingestion into NotebookLM.
|
|
52
|
+
|
|
53
|
+
## Installation
|
|
54
|
+
|
|
55
|
+
```bash
|
|
56
|
+
pipx install ietf-notebook
|
|
57
|
+
```
|
|
58
|
+
|
|
59
|
+
## Usage
|
|
60
|
+
|
|
61
|
+
```bash
|
|
62
|
+
ietf-notebook <wg_shortname> [OPTIONS]
|
|
63
|
+
```
|
|
64
|
+
|
|
65
|
+
### Options
|
|
66
|
+
|
|
67
|
+
- `wg_shortname`: IETF Working Group short name (e.g., `httpbis`).
|
|
68
|
+
- `--destination`: Folder to save files in (default: current directory).
|
|
69
|
+
- `--github`: GitHub org/repo for issues (e.g., `ietf-wg-httpbis/wg-materials`).
|
|
70
|
+
- `--months`: Number of months of mailing list history to fetch (default: all).
|
|
71
|
+
- `--force`: Force re-downloading of existing files. By default, the tool skips files that already exist in the destination.
|
|
72
|
+
- `--quiet`: No messages except for errors and the final resource summary.
|
|
73
|
+
- `--verbose`: Detailed progress reporting.
|
|
74
|
+
|
|
75
|
+
### Default Behavior
|
|
76
|
+
|
|
77
|
+
- **Charters, Meetings, and Mbox**: Existing files are skipped unless `--force` is used.
|
|
78
|
+
- **Mailing List Discovery**: The tool automatically finds the mailing list for the WG from the Datatracker.
|
|
79
|
+
- **IMAP Retrieval**: Mailing list archives are fetched via IMAP from `imap.ietf.org` and cached locally in `.imap-cache/`.
|
|
80
|
+
- **GitHub Strategy**: The tool first checks for `archive.json` on the `gh-pages` branch (common in repos using [Martin Thomson's template](https://github.com/martinthomson/internet-draft-template)).
|
|
81
|
+
- **GitHub Auth**: To avoid rate limits when fetching from the API, set the `GITHUB_TOKEN` environment variable.
|
|
82
|
+
|
|
83
|
+
## Contributing
|
|
84
|
+
|
|
85
|
+
Pull requests are welcome. For major changes, please open an issue first to discuss what you would like to change.
|
|
@@ -0,0 +1,39 @@
|
|
|
1
|
+
# ietf-notebook
|
|
2
|
+
|
|
3
|
+
Automate gathering of [NotebookLM](https://notebooklm.google.com/)-ready documents for an [IETF](https://www.ietf.org/) Working Group.
|
|
4
|
+
|
|
5
|
+
This tool gathers Working Group charters, drafts, meeting minutes, PDF slides, mailing list archives, and GitHub issues into a set of clean text files and PDFs suitable for ingestion into NotebookLM.
|
|
6
|
+
|
|
7
|
+
## Installation
|
|
8
|
+
|
|
9
|
+
```bash
|
|
10
|
+
pipx install ietf-notebook
|
|
11
|
+
```
|
|
12
|
+
|
|
13
|
+
## Usage
|
|
14
|
+
|
|
15
|
+
```bash
|
|
16
|
+
ietf-notebook <wg_shortname> [OPTIONS]
|
|
17
|
+
```
|
|
18
|
+
|
|
19
|
+
### Options
|
|
20
|
+
|
|
21
|
+
- `wg_shortname`: IETF Working Group short name (e.g., `httpbis`).
|
|
22
|
+
- `--destination`: Folder to save files in (default: current directory).
|
|
23
|
+
- `--github`: GitHub org/repo for issues (e.g., `ietf-wg-httpbis/wg-materials`).
|
|
24
|
+
- `--months`: Number of months of mailing list history to fetch (default: all).
|
|
25
|
+
- `--force`: Force re-downloading of existing files. By default, the tool skips files that already exist in the destination.
|
|
26
|
+
- `--quiet`: No messages except for errors and the final resource summary.
|
|
27
|
+
- `--verbose`: Detailed progress reporting.
|
|
28
|
+
|
|
29
|
+
### Default Behavior
|
|
30
|
+
|
|
31
|
+
- **Charters, Meetings, and Mbox**: Existing files are skipped unless `--force` is used.
|
|
32
|
+
- **Mailing List Discovery**: The tool automatically finds the mailing list for the WG from the Datatracker.
|
|
33
|
+
- **IMAP Retrieval**: Mailing list archives are fetched via IMAP from `imap.ietf.org` and cached locally in `.imap-cache/`.
|
|
34
|
+
- **GitHub Strategy**: The tool first checks for `archive.json` on the `gh-pages` branch (common in repos using [Martin Thomson's template](https://github.com/martinthomson/internet-draft-template)).
|
|
35
|
+
- **GitHub Auth**: To avoid rate limits when fetching from the API, set the `GITHUB_TOKEN` environment variable.
|
|
36
|
+
|
|
37
|
+
## Contributing
|
|
38
|
+
|
|
39
|
+
Pull requests are welcome. For major changes, please open an issue first to discuss what you would like to change.
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
__version__ = "0.1.0"
|
|
@@ -0,0 +1,120 @@
|
|
|
1
|
+
import argparse
|
|
2
|
+
import os
|
|
3
|
+
from .mbox import sync_mailing_list
|
|
4
|
+
from .github import download_github_issues, process_github_issues
|
|
5
|
+
from .meetings import process_meetings
|
|
6
|
+
from .charter import process_charter
|
|
7
|
+
from .drafts import process_drafts
|
|
8
|
+
from .utils import Verbosity, LogLevel, log
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
def main() -> None:
    """Command-line entry point: gather WG resources into a destination folder.

    Parses arguments, then runs each gathering step in order (charter,
    meetings, mailing list, drafts, GitHub issues) and prints a summary
    of the resources that were updated.
    """
    parser = argparse.ArgumentParser(
        description="Automate creation of NotebookLM-ready documents for an IETF Working Group."
    )
    parser.add_argument("wg", help="IETF Working Group short name (e.g., 'httpbis')")
    parser.add_argument(
        "--github", help="GitHub owner/repo (e.g., 'ietf-wg-httpbis/wg-materials')"
    )
    parser.add_argument(
        "--months",
        type=int,
        default=None,
        help="Number of months of mailing list archives to fetch (default: all)",
    )
    parser.add_argument(
        "--destination",
        default=".",
        help="Destination folder (default: current directory)",
    )
    parser.add_argument(
        "--force",
        action="store_true",
        help="Overwrite resources that already exist in the destination",
    )
    parser.add_argument("--quiet", "-q", action="store_true", help="Only output errors")
    parser.add_argument(
        "--verbose", "-v", action="store_true", help="Detailed progress reporting"
    )

    args = parser.parse_args()

    # exist_ok avoids the exists()/makedirs() check-then-act race of the
    # previous two-step version.
    os.makedirs(args.destination, exist_ok=True)

    verbosity = Verbosity.STATUS
    if args.quiet:
        verbosity = Verbosity.QUIET
    elif args.verbose:
        verbosity = Verbosity.VERBOSE

    if verbosity != Verbosity.QUIET:
        print(f"Processing WG: {args.wg}")
        print(f"Destination: {args.destination}")
        if args.force:
            print("Force mode: overwriting existing files.")
        else:
            print("Default mode: skipping existing files (except GitHub issues).")
        print("-" * 40)

    results = []

    # 1. Charter
    charter_file = os.path.join(args.destination, f"{args.wg}-charter.txt")
    if not args.force and os.path.exists(charter_file):
        log(
            f"Skipping charter: {charter_file} already exists.",
            verbosity,
            level=LogLevel.PROGRESS,
        )
    else:
        results.extend(process_charter(args.wg, charter_file, verbose=verbosity))

    # 2. Meetings
    results.extend(
        process_meetings(args.wg, args.destination, force=args.force, verbose=verbosity)
    )

    # 3. Mailing List
    results.extend(
        sync_mailing_list(
            args.wg, args.destination, months=args.months, verbose=verbosity
        )
    )

    # 4. Drafts
    results.extend(
        process_drafts(args.wg, args.destination, force=args.force, verbose=verbosity)
    )

    # 5. GitHub Issues
    if args.github:
        gh_json = os.path.join(args.destination, f"{args.wg}-github-issues.json")
        gh_txt = os.path.join(args.destination, f"{args.wg}-github-issues.txt")

        if download_github_issues(args.github, gh_json, verbose=verbosity):
            results.extend(process_github_issues(gh_json, gh_txt, verbose=verbosity))
            # The JSON file is an intermediate artifact; remove it once the
            # text version has been produced.
            try:
                os.remove(gh_json)
            except OSError as err:
                log(f"Error cleaning up {gh_json}: {err}", verbosity, level=LogLevel.ERROR)
    else:
        log(
            "Skip GitHub issues: no GitHub repo provided.",
            verbosity,
            level=LogLevel.PROGRESS,
        )

    if verbosity != Verbosity.QUIET:
        print("-" * 40)
        print("All tasks completed.")

        if results:
            print("\n## Updated Resources")
            # Steps may report the same path more than once; de-duplicate and
            # sort for stable output.  (Was sorted(list(set(...))) — the
            # list() wrapper is redundant.)
            for res in sorted(set(results)):
                rel_path = os.path.relpath(res, os.getcwd())
                print(f"- {rel_path}")


if __name__ == "__main__":
    main()
|
|
@@ -0,0 +1,64 @@
|
|
|
1
|
+
from typing import List
|
|
2
|
+
from bs4 import BeautifulSoup
|
|
3
|
+
from .utils import Verbosity, LogLevel, clean_html, fetch_resource, log
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
def process_charter(
    wg_name: str, output_file: str, verbose: Verbosity = Verbosity.STATUS
) -> List[str]:
    """Fetch the WG charter and write to output_file. Returns list of updated files."""
    url = f"https://datatracker.ietf.org/doc/charter-ietf-{wg_name}/"
    log(f"Fetching charter for {wg_name}...", verbose, level=LogLevel.STATUS)

    # Ask the datatracker for markdown; it may answer with HTML instead.
    res = fetch_resource(url, headers={"Accept": "text/markdown"})
    if not res:
        log(f"Error: Could not fetch charter from {url}", verbose, level=LogLevel.ERROR)
        return []

    if "text/markdown" in res.headers.get("Content-Type", ""):
        charter_text = res.text
    else:
        # HTML fallback: try to isolate just the charter portion of the page.
        page = res.text
        soup = BeautifulSoup(page, "html.parser")

        # The datatracker usually puts the charter in a 'card-body' div;
        # 'charter-text' is the secondary candidate.
        target = soup.find("div", class_="card-body") or soup.find(
            "div", class_="charter-text"
        )

        if not target:
            # Last structural fallback: the first div after a "Charter" heading.
            for h2 in soup.find_all("h2"):
                if h2.string and "Charter" in h2.string:
                    target = h2.find_next("div")
                    break

        if target:
            charter_text = clean_html(str(target))
        else:
            # Nothing isolated the charter; clean the whole page (noisy).
            log(
                "Warning: Could not isolate charter text, cleaning entire page.",
                verbose,
                level=LogLevel.PROGRESS,
            )
            charter_text = clean_html(page)

    if not charter_text:
        return []

    with open(output_file, "w", encoding="utf-8") as out_fh:
        out_fh.write(f"Working Group Charter: {wg_name}\n")
        out_fh.write(f"Source: {url}\n")
        out_fh.write("=" * 80 + "\n\n")
        out_fh.write(charter_text + "\n")

    log(f"Done! Charter written to {output_file}.", verbose, level=LogLevel.STATUS)
    return [output_file]
|
|
@@ -0,0 +1,94 @@
|
|
|
1
|
+
import os
|
|
2
|
+
import re
|
|
3
|
+
from typing import List, Dict, Any
|
|
4
|
+
from bs4 import BeautifulSoup
|
|
5
|
+
from .utils import LogLevel, Verbosity, log, fetch_resource
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
def get_adopted_drafts(
    wg_name: str, verbose: Verbosity = Verbosity.STATUS
) -> List[Dict[str, Any]]:
    """Scrape WG documents page for active drafts."""
    url = f"https://datatracker.ietf.org/wg/{wg_name}/documents/"
    log(f"Finding adopted drafts for {wg_name}...", verbose, level=LogLevel.STATUS)
    res = fetch_resource(url)
    if not res:
        return []

    page = BeautifulSoup(res.text, "html.parser")

    # Adopted-draft links start with /doc/draft-ietf-{wg}-, and the link text
    # ends in the revision number (e.g. "draft-ietf-aipref-vocab-05").
    href_prefix = f"/doc/draft-ietf-{wg_name}-"
    text_prefix = f"draft-ietf-{wg_name}-"
    rev_pattern = re.compile(r"(draft-ietf-" + re.escape(wg_name) + r"-.*?)-(\d+)$")

    # Track the highest revision seen for each draft name (insertion order is
    # first-seen order, matching the original de-duplication pass).
    latest: Dict[str, int] = {}
    for anchor in page.find_all("a", href=True):
        link = anchor.get("href")
        if not isinstance(link, str) or not link.startswith(href_prefix):
            continue

        label = anchor.get_text(strip=True)
        if not label.startswith(text_prefix):
            continue

        matched = rev_pattern.search(label)
        if not matched:
            continue
        try:
            revision = int(matched.group(2))
        except ValueError:
            continue

        draft = matched.group(1)
        if revision > latest.get(draft, -1):
            latest[draft] = revision

    return [{"name": name, "max_rev": rev} for name, rev in latest.items()]
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
def process_drafts(
    wg_name: str,
    destination: str,
    force: bool = False,
    verbose: Verbosity = Verbosity.STATUS,
) -> List[str]:
    """Download all revisions of WG drafts as text.

    Args:
        wg_name: Working Group short name (e.g., 'httpbis').
        destination: Directory to write draft text files into.
        force: When True, re-download revisions that already exist on disk.
        verbose: Logging verbosity.

    Returns:
        List of file paths that were written.
    """
    updated = []
    drafts = get_adopted_drafts(wg_name, verbose)
    if not drafts:
        log(f"No adopted drafts found for {wg_name}.", verbose, level=LogLevel.STATUS)
        return []

    for draft in drafts:
        name = draft["name"]
        max_rev = draft["max_rev"]
        log(
            f"Processing draft: {name} (revs 00 to {max_rev:02d})",
            verbose,
            level=LogLevel.STATUS,
        )

        for rev in range(max_rev + 1):
            rev_str = f"{rev:02d}"
            filename = f"{name}-{rev_str}.txt"
            filepath = os.path.join(destination, filename)

            # Skip revisions already on disk unless forced.
            if not force and os.path.exists(filepath):
                continue

            url = f"https://www.ietf.org/archive/id/{name}-{rev_str}.txt"
            # BUG FIX: the message was the placeholder-less f-string
            # "Downloading (unknown)..."; report the URL actually fetched.
            log(f"Downloading {url}...", verbose, level=LogLevel.PROGRESS)
            res = fetch_resource(url)
            if res:
                with open(filepath, "w", encoding="utf-8") as out_fh:
                    # res.text is already a str; the str() wrapper was redundant.
                    out_fh.write(res.text)
                updated.append(filepath)

    return updated
|
|
@@ -0,0 +1,242 @@
|
|
|
1
|
+
import json
|
|
2
|
+
import os
|
|
3
|
+
from datetime import datetime
|
|
4
|
+
from typing import Optional, List, Dict, Any
|
|
5
|
+
import requests
|
|
6
|
+
from .utils import LogLevel, Verbosity, log
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
def format_date(iso_date: Optional[str]) -> str:
    """Convert ISO date to a more readable format."""
    if not iso_date:
        return "(Unknown Date)"
    # Normalise a trailing "Z" to an explicit UTC offset, which
    # datetime.fromisoformat accepts on all supported Python versions.
    normalised = iso_date.replace("Z", "+00:00")
    try:
        parsed = datetime.fromisoformat(normalised)
    except (ValueError, TypeError):
        # Unparseable input is passed through unchanged.
        return iso_date
    return parsed.strftime("%Y-%m-%d %H:%M:%S %Z")
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
def process_github_issues(
    input_file: str, output_file: str, verbose: Verbosity = Verbosity.STATUS
) -> List[str]:
    """Process a GitHub issues JSON archive and write cleaned text to output_file."""
    log(f"Opening {input_file}...", verbose, level=LogLevel.PROGRESS)
    try:
        with open(input_file, "r", encoding="utf-8") as json_fh:
            archive = json.load(json_fh)
    except (json.JSONDecodeError, OSError) as err:
        log(f"Error parsing GitHub JSON: {err}", verbose, level=LogLevel.ERROR)
        return []

    issues = archive.get("issues", [])

    with open(output_file, "w", encoding="utf-8") as out_fh:
        # File header: repository identity and archive export time.
        out_fh.write(f"Repository: {archive.get('repo', 'Unknown Repo')}\n")
        out_fh.write(f"Archive Export Date: {format_date(archive.get('timestamp'))}\n")
        out_fh.write("=" * 80 + "\n\n")

        for issue in issues:
            # Per-issue metadata header.
            header = [
                f"Issue #{issue.get('number', '?')}: {issue.get('title', '(No Title)')}\n",
                f"State: {issue.get('state', '(Unknown State)')}\n",
                f"Date: {format_date(issue.get('createdAt'))}\n",
                f"Author: {issue.get('author', '(Unknown Author)')}\n",
            ]
            label_list = ", ".join(issue.get("labels", []))
            if label_list:
                header.append(f"Labels: {label_list}\n")
            header.append("\n")
            out_fh.write("".join(header))

            description = (issue.get("body") or "").strip()
            out_fh.write((description or "(No description provided)") + "\n")

            comments = issue.get("comments", [])
            if comments:
                out_fh.write("\n" + "-" * 40 + "\n")
                out_fh.write(f"Comments ({len(comments)}):\n\n")
                for note in comments:
                    who = note.get("author", "(Unknown)")
                    when = format_date(note.get("createdAt"))
                    text = (note.get("body") or "").strip()
                    out_fh.write(f"--- Comment by {who} on {when} ---\n")
                    out_fh.write(text + "\n\n")

            out_fh.write("=" * 80 + "\n\n")

    log(
        f"Done! Extracted {len(issues)} issues to {output_file}.",
        verbose,
        level=LogLevel.STATUS,
    )
    return [output_file]
|
|
79
|
+
|
|
80
|
+
|
|
81
|
+
def download_github_issues(
    repo_short: str,
    dest_path: str,
    token: Optional[str] = None,
    verbose: Verbosity = Verbosity.STATUS,
) -> bool:
    """Download GitHub issues JSON using the API from 'owner/repo' short name.

    Tries three strategies, in order:
      1. If repo_short is a URL, download it directly.
      2. Look for a pre-built archive.json on the repo's gh-pages site.
      3. Fall back to paging through the GitHub issues API.

    Args:
        repo_short: 'owner/repo' short name, or a direct URL to issues JSON.
        dest_path: Path to write the JSON archive to.
        token: GitHub API token; falls back to the GITHUB_TOKEN env var.
        verbose: Logging verbosity.

    Returns:
        True if a JSON archive was written to dest_path, False otherwise.
    """
    # Strategy 1: a full URL is downloaded as-is.
    if repo_short.startswith("http"):
        log(
            f"Direct downloading GitHub issues from {repo_short}...",
            verbose,
            level=LogLevel.STATUS,
        )
        try:
            response = requests.get(repo_short, timeout=60)
            response.raise_for_status()
            with open(dest_path, "w", encoding="utf-8") as json_file:
                json_file.write(response.text)
            return True
        except (requests.RequestException, OSError) as err:
            log(
                f"Error downloading GitHub issues: {err}", verbose, level=LogLevel.ERROR
            )
            return False

    # Expecting owner/repo
    if "/" not in repo_short:
        log(
            f"Invalid GitHub short name: {repo_short}. Expected 'owner/repo'.",
            verbose,
            level=LogLevel.ERROR,
        )
        return False
    owner, repo = repo_short.split("/", 1)
    archive_url = f"https://{owner}.github.io/{repo}/archive.json"

    # Strategy 2: pre-built archive on gh-pages.
    log(
        f"Checking for GitHub archive at {archive_url}...",
        verbose,
        level=LogLevel.STATUS,
    )
    try:
        response = requests.get(archive_url, timeout=30)
        if response.status_code == 200:
            log("Archive found; downloading...", verbose, level=LogLevel.STATUS)
            try:
                archive_data = response.json()
                # Normalise to our expected schema: a dict with an 'issues' list.
                if isinstance(archive_data, list):
                    archive_data = {
                        "repo": f"{owner}/{repo}",
                        "timestamp": datetime.now().isoformat(),
                        "issues": archive_data,
                    }
                elif "issues" not in archive_data:
                    # A dict without 'issues' is treated as a single issue
                    # and wrapped into the expected schema.
                    archive_data = {
                        "repo": f"{owner}/{repo}",
                        "timestamp": datetime.now().isoformat(),
                        "issues": [archive_data],
                    }
                with open(dest_path, "w", encoding="utf-8") as json_fh:
                    json.dump(archive_data, json_fh, indent=2)
                return True
            except (json.JSONDecodeError, TypeError) as err:
                # BUG FIX: this log call previously passed a hard-coded
                # Verbosity.VERBOSE, ignoring the caller's verbosity setting.
                log(
                    f"Error parsing archive JSON: {err}",
                    verbose,
                    level=LogLevel.STATUS,
                )
        log("No archive found on gh-pages.", verbose, level=LogLevel.PROGRESS)
    except (requests.RequestException, OSError) as err:
        log(
            f"Error checking gh-pages archive: {err}",
            verbose,
            level=LogLevel.STATUS,
        )

    # Strategy 3: GitHub REST API, optionally authenticated to raise rate limits.
    log(
        f"Fetching GitHub issues via API for {owner}/{repo}...",
        verbose,
        level=LogLevel.STATUS,
    )
    headers = {"Accept": "application/vnd.github.v3+json"}
    github_token = token or os.environ.get("GITHUB_TOKEN")
    if github_token:
        headers["Authorization"] = f"token {github_token}"

    try:
        all_issues = _fetch_all_issues(owner, repo, headers, verbose)
        export_data = {
            "repo": f"{owner}/{repo}",
            "timestamp": datetime.now().isoformat(),
            "issues": all_issues,
        }
        with open(dest_path, "w", encoding="utf-8") as json_fh:
            json.dump(export_data, json_fh, indent=2)
        return True
    except (requests.RequestException, OSError) as err:
        log(f"Error fetching GitHub issues: {err}", verbose, level=LogLevel.ERROR)
        return False
|
|
180
|
+
|
|
181
|
+
|
|
182
|
+
def _fetch_all_issues(
    owner: str, repo_name: str, headers: Dict[str, str], verbose: Verbosity
) -> List[Dict[str, Any]]:
    """Fetch all issues and their comments from GitHub API."""
    collected: List[Dict[str, Any]] = []
    page = 1
    while True:
        api_url = (
            f"https://api.github.com/repos/{owner}/{repo_name}/issues"
            f"?state=all&page={page}&per_page=100"
        )
        response = requests.get(api_url, headers=headers, timeout=60)
        response.raise_for_status()
        batch = response.json()
        if not batch:
            break

        for entry in batch:
            # The issues endpoint also returns pull requests (they carry a
            # 'pull_request' key); skip them.
            if "pull_request" in entry:
                continue

            record: Dict[str, Any] = {
                "number": entry.get("number"),
                "title": entry.get("title"),
                "state": entry.get("state"),
                "author": entry.get("user", {}).get("login"),
                "createdAt": entry.get("created_at"),
                "labels": [label.get("name") for label in entry.get("labels", [])],
                "body": entry.get("body"),
                "comments": [],
            }
            # On the issue object 'comments' is a count; fetch bodies if any.
            if entry.get("comments", 0) > 0:
                record["comments"] = _fetch_issue_comments(
                    entry.get("comments_url"), headers
                )

            collected.append(record)

        # A short page means we've reached the end of the listing.
        if len(batch) < 100:
            break
        page += 1
    return collected
|
|
225
|
+
|
|
226
|
+
|
|
227
|
+
def _fetch_issue_comments(
    comments_url: str, headers: Dict[str, str]
) -> List[Dict[str, Any]]:
    """Fetch comments for a specific issue."""
    response = requests.get(comments_url, headers=headers, timeout=30)
    if response.status_code != 200:
        # Any non-OK response is treated as "no comments" rather than an error.
        return []

    extracted = []
    for entry in response.json():
        extracted.append(
            {
                "author": entry.get("user", {}).get("login"),
                "createdAt": entry.get("created_at"),
                "body": entry.get("body"),
            }
        )
    return extracted
|