recent_state_summarizer-0.0.8.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (27)
  1. recent_state_summarizer-0.0.8/LICENSE +21 -0
  2. recent_state_summarizer-0.0.8/PKG-INFO +112 -0
  3. recent_state_summarizer-0.0.8/README.md +76 -0
  4. recent_state_summarizer-0.0.8/pyproject.toml +48 -0
  5. recent_state_summarizer-0.0.8/recent_state_summarizer/__init__.py +1 -0
  6. recent_state_summarizer-0.0.8/recent_state_summarizer/__main__.py +93 -0
  7. recent_state_summarizer-0.0.8/recent_state_summarizer/fetch/__init__.py +153 -0
  8. recent_state_summarizer-0.0.8/recent_state_summarizer/fetch/adventar.py +59 -0
  9. recent_state_summarizer-0.0.8/recent_state_summarizer/fetch/hatena_blog.py +39 -0
  10. recent_state_summarizer-0.0.8/recent_state_summarizer/fetch/hatena_bookmark.py +34 -0
  11. recent_state_summarizer-0.0.8/recent_state_summarizer/fetch/qiita_advent_calendar.py +58 -0
  12. recent_state_summarizer-0.0.8/recent_state_summarizer/summarize.py +76 -0
  13. recent_state_summarizer-0.0.8/recent_state_summarizer.egg-info/PKG-INFO +112 -0
  14. recent_state_summarizer-0.0.8/recent_state_summarizer.egg-info/SOURCES.txt +25 -0
  15. recent_state_summarizer-0.0.8/recent_state_summarizer.egg-info/dependency_links.txt +1 -0
  16. recent_state_summarizer-0.0.8/recent_state_summarizer.egg-info/entry_points.txt +2 -0
  17. recent_state_summarizer-0.0.8/recent_state_summarizer.egg-info/requires.txt +19 -0
  18. recent_state_summarizer-0.0.8/recent_state_summarizer.egg-info/top_level.txt +3 -0
  19. recent_state_summarizer-0.0.8/setup.cfg +4 -0
  20. recent_state_summarizer-0.0.8/setup.py +3 -0
  21. recent_state_summarizer-0.0.8/tests/fetch/__init__.py +0 -0
  22. recent_state_summarizer-0.0.8/tests/fetch/test_adventar.py +130 -0
  23. recent_state_summarizer-0.0.8/tests/fetch/test_core.py +57 -0
  24. recent_state_summarizer-0.0.8/tests/fetch/test_hatena_blog.py +195 -0
  25. recent_state_summarizer-0.0.8/tests/fetch/test_hatena_bookmark.py +61 -0
  26. recent_state_summarizer-0.0.8/tests/fetch/test_qiita_advent_calendar.py +118 -0
  27. recent_state_summarizer-0.0.8/tests/test_main.py +163 -0
@@ -0,0 +1,21 @@
+ MIT License
+
+ Copyright (c) 2023 nikkie
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in all
+ copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
@@ -0,0 +1,112 @@
+ Metadata-Version: 2.4
+ Name: recent-state-summarizer
+ Version: 0.0.8
+ Summary: Summarize a list of entry titles using LLM
+ Author-email: nikkie <takuyafjp+develop@gmail.com>
+ License: MIT
+ Classifier: Development Status :: 1 - Planning
+ Classifier: License :: OSI Approved :: MIT License
+ Classifier: Programming Language :: Python
+ Classifier: Programming Language :: Python :: 3
+ Classifier: Programming Language :: Python :: 3.10
+ Classifier: Programming Language :: Python :: 3.11
+ Classifier: Programming Language :: Python :: 3.12
+ Classifier: Programming Language :: Python :: 3.13
+ Classifier: Programming Language :: Python :: 3.14
+ Requires-Python: >=3.10
+ Description-Content-Type: text/markdown
+ License-File: LICENSE
+ Requires-Dist: beautifulsoup4
+ Requires-Dist: feedparser
+ Requires-Dist: httpx
+ Requires-Dist: openai<1
+ Provides-Extra: testing
+ Requires-Dist: pytest; extra == "testing"
+ Requires-Dist: responses; extra == "testing"
+ Requires-Dist: respx; extra == "testing"
+ Provides-Extra: lint
+ Requires-Dist: flake8; extra == "lint"
+ Requires-Dist: black; extra == "lint"
+ Requires-Dist: isort; extra == "lint"
+ Provides-Extra: dev
+ Requires-Dist: wheel; extra == "dev"
+ Requires-Dist: build; extra == "dev"
+ Requires-Dist: twine; extra == "dev"
+ Dynamic: license-file
+
+ # recent-state-summarizer
+
+ Summarize blog article titles with the OpenAI API
+
+ a.k.a. _RSS_ 😃
+
+ ## Setup
+
+ ```
+ $ pip install recent-state-summarizer
+ ```
+
+ ⚠️ Set the `OPENAI_API_KEY` environment variable.
+ ref: https://platform.openai.com/account/api-keys
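+
+ For example, in a POSIX shell (the key value below is a placeholder):
+
+ ```
+ $ export OPENAI_API_KEY="sk-..."
+ ```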
+
+ ## Usage
+
+ ```
+ $ omae-douyo https://nikkie-ftnext.hatenablog.com/archive/2023/4
+
+ この人物は最近、プログラミングに関することを中心にして活動しています。
+
+ (略)
+
+ 最近は、株式会社はてなに入社したようです。
+ ```
+
+ (In English: "This person has recently been active, mainly around topics related to programming. … Recently, they seem to have joined Hatena Co., Ltd.")
+
+ Currently supported:
+
+ - はてなブログ (Hatena Blog)
+ - はてなブックマーク RSS (Hatena Bookmark RSS)
+ - Adventar
+ - Qiita Advent Calendar
+
+ For help, run `omae-douyo -h`.
+
+ ### Fetch only (save to file)
+
+ Fetch the titles and URLs of articles and save them to a file without summarization (samples of both output formats follow the commands below):
+
+ ```
+ # Save as JSON Lines (default)
+ $ omae-douyo fetch https://nikkie-ftnext.hatenablog.com/archive/2023/4 articles.jsonl
+
+ # Save as a bullet list of titles
+ $ omae-douyo fetch https://nikkie-ftnext.hatenablog.com/archive/2023/4 titles.txt --as-title-list
+ ```
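+
+ With the default JSON Lines output, each line of `articles.jsonl` is a single JSON object with `title` and `url` keys (Hatena Bookmark feeds also include `description`). A hypothetical line (values are placeholders):
+
+ ```
+ {"title": "An example article title", "url": "https://nikkie-ftnext.hatenablog.com/entry/example"}
+ ```
+
+ With `--as-title-list`, the file is a plain bullet list of titles:
+
+ ```
+ - An example article title
+ ```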
+
+ ## Development
+
+ ### Sub commands
+
+ Fetch only (same as `omae-douyo fetch`):
+
+ ```
+ python -m recent_state_summarizer.fetch -h
+ ```
+
+ Summarize only:
+ It's convenient to skip fetching when tuning the prompt (a sample input file follows).
+
+ ```
+ python -m recent_state_summarizer.summarize -h
+ ```
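+
+ In the default `run` flow, the summarizer receives the bullet list that `fetch --as-title-list` produces, so a hand-made input file for prompt tuning can look like this (titles are placeholders):
+
+ ```
+ - An example article title
+ - Another example article title
+ ```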
+
+ ### Environment
+
+ ```
+ $ git clone https://github.com/ftnext/recent-state-summarizer.git
+ $ cd recent-state-summarizer
+
+ $ python -m venv venv
+ $ source venv/bin/activate
+ (venv) $ pip install -r requirements.lock
+ (venv) $ pip install -e '.'
+ ```
@@ -0,0 +1,76 @@
+ # recent-state-summarizer
+
+ Summarize blog article titles with the OpenAI API
+
+ a.k.a. _RSS_ 😃
+
+ ## Setup
+
+ ```
+ $ pip install recent-state-summarizer
+ ```
+
+ ⚠️ Set the `OPENAI_API_KEY` environment variable.
+ ref: https://platform.openai.com/account/api-keys
+
16
+ ## Usage
17
+
18
+ ```
19
+ $ omae-douyo https://nikkie-ftnext.hatenablog.com/archive/2023/4
20
+
21
+ この人物は最近、プログラミングに関することを中心にして活動しています。
22
+
23
+ (略)
24
+
25
+ 最近は、株式会社はてなに入社したようです。
26
+ ```
27
+
28
+ Currently support:
29
+
30
+ - はてなブログ(Hatena blog)
31
+ - はてなブックマークRSS
32
+ - Adventar
33
+ - Qiita Advent Calendar
34
+
35
+ To see help, type `omae-douyo -h`.
36
+
37
+ ### Fetch only (save to file)
38
+
39
+ Fetch titles and URLs of articles, and save them to a file without summarization:
40
+
41
+ ```
42
+ # Save as JSON format (default)
43
+ $ omae-douyo fetch https://nikkie-ftnext.hatenablog.com/archive/2023/4 articles.jsonl
44
+
45
+ # Save as bullet list
46
+ $ omae-douyo fetch https://nikkie-ftnext.hatenablog.com/archive/2023/4 titles.txt --as-title-list
47
+ ```
48
+
49
+ ## Development
50
+
51
+ ### Sub commands
52
+
53
+ Fetch only (same as `omae-douyo fetch`):
54
+
55
+ ```
56
+ python -m recent_state_summarizer.fetch -h
57
+ ```
58
+
59
+ Summarize only:
60
+ It's convenient to omit fetching in tuning the prompt.
61
+
62
+ ```
63
+ python -m recent_state_summarizer.summarize -h
64
+ ```
65
+
66
+ ### Environment
67
+
68
+ ```
69
+ $ git clone https://github.com/ftnext/recent-state-summarizer.git
70
+ $ cd recent-state-summarizer
71
+
72
+ $ python -m venv venv
73
+ $ source venv/bin/activate
74
+ (venv) $ pip install -r requirements.lock
75
+ (venv) $ pip install -e '.'
76
+ ```
@@ -0,0 +1,48 @@
+ [build-system]
+ requires = ["setuptools"]
+ build-backend = "setuptools.build_meta"
+
+ [project]
+ name = "recent-state-summarizer"
+ authors = [
+     {name = "nikkie", email = "takuyafjp+develop@gmail.com"},
+ ]
+ description = "Summarize a list of entry titles using LLM"
+ readme = {file = "README.md", content-type = "text/markdown"}
+ requires-python = ">=3.10"
+ license = {text = "MIT"}
+ classifiers = [
+     "Development Status :: 1 - Planning",
+     "License :: OSI Approved :: MIT License",
+     "Programming Language :: Python",
+     "Programming Language :: Python :: 3",
+     "Programming Language :: Python :: 3.10",
+     "Programming Language :: Python :: 3.11",
+     "Programming Language :: Python :: 3.12",
+     "Programming Language :: Python :: 3.13",
+     "Programming Language :: Python :: 3.14",
+ ]
+ dependencies = [
+     "beautifulsoup4",
+     "feedparser",
+     "httpx",
+     "openai<1",
+ ]
+ dynamic = ["version"]
+
+ [project.optional-dependencies]
+ testing = ["pytest", "responses", "respx"]
+ lint = ["flake8", "black", "isort"]
+ dev = ["wheel", "build", "twine"]
+
+ [project.scripts]
+ omae-douyo = "recent_state_summarizer.__main__:main"
+
+ [tool.setuptools.packages.find]
+ exclude = ["tests"]
+
+ [tool.setuptools.dynamic]
+ version = {attr = "recent_state_summarizer.__version__"}
+
+ # pytest reads pyproject configuration from [tool.pytest.ini_options];
+ # a bare [tool.pytest] table is ignored.
+ [tool.pytest.ini_options]
+ testpaths = ["tests"]
@@ -0,0 +1 @@
+ __version__ = "0.0.8"
@@ -0,0 +1,93 @@
+ import argparse
+ import sys
+ import tempfile
+ from textwrap import dedent
+
+ from recent_state_summarizer.fetch import _main as fetch_main
+ from recent_state_summarizer.fetch import build_parser as build_fetch_parser
+ from recent_state_summarizer.summarize import summarize_titles
+
+
+ def build_parser():
+     help_message = """
+     Summarize blog article titles with the OpenAI API.
+
+     ⚠️ Set the `OPENAI_API_KEY` environment variable.
+
+     Example:
+         omae-douyo https://awesome.hatenablog.com/archive/2023
+
+     Retrieves the titles of articles from the specified URL and,
+     after summarization, prints the summary.
+     """
+     parser = argparse.ArgumentParser(
+         formatter_class=argparse.RawDescriptionHelpFormatter,
+         description=dedent(help_message),
+     )
+     subparsers = parser.add_subparsers(dest="subcommand")
+
+     run_parser = subparsers.add_parser(
+         "run", help="Fetch article titles and generate summary (default)"
+     )
+     run_parser.add_argument("url", help="URL of archive page")
+     run_parser.set_defaults(func=run_cli)
+
+     fetch_help_message = """
+     Retrieve the titles and URLs of articles from a web page specified by URL
+     and save them in JSON Lines format.
+
+     Supported:
+     - はてなブログ (Hatena Blog)
+     - はてなブックマーク RSS (Hatena Bookmark RSS)
+     - Adventar
+     - Qiita Advent Calendar
+
+     Example:
+         omae-douyo fetch https://awesome.hatenablog.com/archive/2023 articles.jsonl
+     """
+     fetch_parser = subparsers.add_parser(
+         "fetch",
+         parents=[build_fetch_parser(add_help=False)],
+         help="Fetch article titles only and save to file",
+         formatter_class=argparse.RawDescriptionHelpFormatter,
+         description=dedent(fetch_help_message),
+     )
+     fetch_parser.set_defaults(func=fetch_cli)
+
+     return parser
+
+
+ def run_cli(args):
+     # Fetch titles into a temporary file, then read them back for summarization
+     with tempfile.NamedTemporaryFile(mode="w+") as tempf:
+         fetch_main(args.url, tempf.name, save_as_title_list=True)
+         tempf.seek(0)
+         titles = tempf.read()
+     summary = summarize_titles(titles)
+     print(summary)
+
+
+ def fetch_cli(args):
+     fetch_main(args.url, args.save_path, save_as_title_list=args.as_title_list)
+
+
+ def normalize_argv() -> list[str]:
+     """Insert the default ``run`` subcommand when none is given."""
+     argv = sys.argv[1:]
+     if len(argv) == 0:
+         return ["--help"]
+
+     help_flags = {"-h", "--help"}
+     if argv[0] in help_flags:
+         return argv
+
+     known_subcommands = {"run", "fetch"}
+     if argv[0] not in known_subcommands:
+         # e.g. ``omae-douyo URL`` is treated as ``omae-douyo run URL``
+         return ["run"] + argv
+
+     return argv
+
+
+ def main():
+     parser = build_parser()
+     argv = normalize_argv()
+     args = parser.parse_args(argv)
+     args.func(args)
@@ -0,0 +1,153 @@
+ from __future__ import annotations
+
+ import argparse
+ import json
+ import logging
+ import textwrap
+ from collections.abc import Iterable
+ from enum import Enum
+ from pathlib import Path
+ from urllib.parse import urlparse
+
+ from recent_state_summarizer.fetch.adventar import (
+     TitleTag,
+     fetch_adventar_calendar,
+ )
+ from recent_state_summarizer.fetch.hatena_blog import _fetch_titles
+ from recent_state_summarizer.fetch.hatena_bookmark import (
+     fetch_hatena_bookmark_rss,
+ )
+ from recent_state_summarizer.fetch.qiita_advent_calendar import (
+     fetch_qiita_advent_calendar,
+ )
+
+ logger = logging.getLogger(__name__)
+
+
+ class URLType(Enum):
+     """Type of URL for fetching."""
+
+     HATENA_BLOG = "hatena_blog"
+     HATENA_BOOKMARK_RSS = "hatena_bookmark_rss"
+     ADVENTAR = "adventar"
+     QIITA_ADVENT_CALENDAR = "qiita_advent_calendar"
+     UNKNOWN = "unknown"
+
+
+ def _detect_url_type(url: str) -> URLType:
+     """Detect the type of URL to determine the fetch strategy.
+
+     Args:
+         url: URL to analyze
+
+     Returns:
+         URLType indicating the fetch strategy to use
+     """
+     parsed = urlparse(url)
+     if (
+         parsed.netloc == "b.hatena.ne.jp"
+         and parsed.path.startswith("/entrylist/")
+         and parsed.path.endswith(".rss")
+     ):
+         return URLType.HATENA_BOOKMARK_RSS
+
+     if parsed.netloc == "qiita.com" and "/advent-calendar/" in parsed.path:
+         return URLType.QIITA_ADVENT_CALENDAR
+
+     if "/calendars/" in parsed.path or "adventar.org" in parsed.netloc:
+         return URLType.ADVENTAR
+
+     if (
+         "hatenablog.com" in url
+         or "hateblo.jp" in url
+         or "/archive/" in parsed.path
+     ):
+         return URLType.HATENA_BLOG
+
+     return URLType.UNKNOWN
+
+
+ def _select_fetcher(url_type: URLType):
+     match url_type:
+         case URLType.HATENA_BOOKMARK_RSS:
+             return fetch_hatena_bookmark_rss
+         case URLType.HATENA_BLOG:
+             return _fetch_titles
+         case URLType.ADVENTAR:
+             return fetch_adventar_calendar
+         case URLType.QIITA_ADVENT_CALENDAR:
+             return fetch_qiita_advent_calendar
+         case _:
+             raise ValueError(f"Unsupported URL type: {url_type}")
+
+
+ def _main(
+     url: str, save_path: str | Path, *, save_as_title_list: bool
+ ) -> None:
+     url_type = _detect_url_type(url)
+     fetcher = _select_fetcher(url_type)
+     title_tags = fetcher(url)
+     if save_as_title_list:
+         contents = _as_bullet_list(
+             title_tag["title"] for title_tag in title_tags
+         )
+     else:
+         contents = _as_json(title_tags)
+     _save(save_path, contents)
+
+
+ def _as_bullet_list(titles: Iterable[str]) -> str:
+     return "\n".join(f"- {title}" for title in titles)
+
+
+ def _as_json(title_tags: Iterable[TitleTag]) -> str:
+     return "\n".join(
+         json.dumps(title_tag, ensure_ascii=False) for title_tag in title_tags
+     )
+
+
+ def _save(path: str | Path, contents: str) -> None:
+     with open(path, "w", encoding="utf8", newline="") as f:
+         f.write(contents)
+
+
+ def build_parser(add_help: bool = True) -> argparse.ArgumentParser:
+     help_message = """
+     Retrieve the titles and URLs of articles from a web page specified by URL
+     and save them in JSON Lines format.
+
+     Supported:
+     - はてなブログ (Hatena Blog)
+     - はてなブックマーク RSS (Hatena Bookmark RSS)
+     - Adventar
+     - Qiita Advent Calendar
+
+     Example:
+         python -m recent_state_summarizer.fetch \\
+             https://awesome.hatenablog.com/archive/2023 articles.jsonl
+     """
+     parser = argparse.ArgumentParser(
+         formatter_class=argparse.RawDescriptionHelpFormatter,
+         description=textwrap.dedent(help_message),
+         add_help=add_help,
+     )
+     parser.add_argument("url", help="URL of archive page")
+     parser.add_argument("save_path", help="Local file path")
+     parser.add_argument(
+         "--as-title-list",
+         action="store_true",
+         default=False,
+         help="Save as title-only bullet list instead of JSON Lines",
+     )
+     return parser
+
+
+ def cli():
+     parser = build_parser()
+     args = parser.parse_args()
+
+     _main(args.url, args.save_path, save_as_title_list=args.as_title_list)
+
+
+ if __name__ == "__main__":
+     cli()
@@ -0,0 +1,59 @@
+ from collections.abc import Generator
+ from typing import TypedDict
+
+ import httpx
+ from bs4 import BeautifulSoup
+
+
+ class TitleTag(TypedDict):
+     title: str
+     url: str
+
+
+ def _fetch(url: str) -> str:
+     response = httpx.get(url)
+     response.raise_for_status()
+     return response.text
+
+
+ def fetch_adventar_calendar(url: str) -> Generator[TitleTag, None, None]:
+     """Fetch article titles and URLs from an Adventar calendar.
+
+     Args:
+         url: Adventar calendar URL (e.g., https://adventar.org/calendars/11474)
+
+     Yields:
+         TitleTag dictionaries containing title and url
+     """
+     raw_html = _fetch(url)
+     yield from _parse_titles(raw_html)
+
+
+ def _parse_titles(raw_html: str) -> Generator[TitleTag, None, None]:
+     """Parse titles from Adventar calendar HTML."""
+     soup = BeautifulSoup(raw_html, "html.parser")
+     entry_list = soup.find("ul", class_="EntryList")
+     if not entry_list:
+         return
+
+     items = entry_list.find_all("li", class_="item")
+     for item in items:
+         article = item.find("div", class_="article")
+         if not article:
+             continue
+
+         link_div = article.find("div", class_="link")
+         if not link_div:
+             continue
+
+         link = link_div.find("a")
+         if not link or "href" not in link.attrs:
+             continue
+
+         # Prefer the sibling div holding the article title; fall back to
+         # the link text when it is absent or empty
+         title_div = link_div.find_next_sibling("div")
+         if title_div and title_div.text.strip():
+             title = title_div.text.strip()
+         else:
+             title = link.text.strip()
+
+         yield {"title": title, "url": link["href"]}
@@ -0,0 +1,39 @@
+ from collections.abc import Generator
+ from typing import TypedDict
+
+ import httpx
+ from bs4 import BeautifulSoup
+
+ PARSE_HATENABLOG_KWARGS = {"name": "a", "attrs": {"class": "entry-title-link"}}
+
+
+ class TitleTag(TypedDict):
+     title: str
+     url: str
+
+
+ def _fetch(url: str) -> str:
+     with httpx.Client() as client:
+         response = client.get(url)
+         response.raise_for_status()
+         return response.text
+
+
+ def _fetch_titles(url: str) -> Generator[TitleTag, None, None]:
+     raw_html = _fetch(url)
+     yield from _parse_titles(raw_html)
+
+     # Follow pagination recursively: archive pages link to the next page
+     # with an anchor of class "test-pager-next"
+     soup = BeautifulSoup(raw_html, "html.parser")
+     next_link = soup.find("a", class_="test-pager-next")
+     if next_link and "href" in next_link.attrs:
+         next_url = next_link["href"]
+         print(f"Next page found, fetching... {next_url}")
+         yield from _fetch_titles(next_url)
+
+
+ def _parse_titles(raw_html: str) -> Generator[TitleTag, None, None]:
+     soup = BeautifulSoup(raw_html, "html.parser")
+     body = soup.body
+     title_tags = body.find_all(**PARSE_HATENABLOG_KWARGS)
+     for title_tag in title_tags:
+         yield {"title": title_tag.text, "url": title_tag["href"]}
@@ -0,0 +1,34 @@
+ from collections.abc import Generator
+ from typing import TypedDict
+
+ import feedparser
+ import httpx
+
+
+ class BookmarkEntry(TypedDict):
+     title: str
+     url: str
+     description: str
+
+
+ def fetch_hatena_bookmark_rss(
+     url: str,
+ ) -> Generator[BookmarkEntry, None, None]:
+     """Fetch entries from a Hatena Bookmark RSS feed.
+
+     Args:
+         url: URL of the Hatena Bookmark RSS feed
+
+     Yields:
+         Bookmark entries with title, url, and description
+     """
+     response = httpx.get(url)
+     response.raise_for_status()
+
+     feed = feedparser.parse(response.content)
+
+     for entry in feed.entries:
+         yield {
+             "title": entry.title,
+             "url": entry.link,
+             "description": entry.description,
+         }
@@ -0,0 +1,58 @@
+ import json
+ from collections.abc import Generator
+ from typing import TypedDict
+
+ import httpx
+ from bs4 import BeautifulSoup
+
+
+ class TitleTag(TypedDict):
+     title: str
+     url: str
+
+
+ def _fetch(url: str) -> str:
+     response = httpx.get(url)
+     response.raise_for_status()
+     return response.text
+
+
+ def fetch_qiita_advent_calendar(url: str) -> Generator[TitleTag, None, None]:
+     """Fetch article titles and URLs from a Qiita Advent Calendar.
+
+     Args:
+         url: Qiita Advent Calendar URL
+             (e.g., https://qiita.com/advent-calendar/2025/python-type-hints)
+
+     Yields:
+         TitleTag dictionaries containing title and url
+     """
+     raw_html = _fetch(url)
+     yield from _parse_titles(raw_html)
+
+
+ def _parse_titles(raw_html: str) -> Generator[TitleTag, None, None]:
+     """Parse titles from Qiita Advent Calendar HTML by extracting JSON data."""
+     soup = BeautifulSoup(raw_html, "html.parser")
+     script_tag = soup.find(
+         "script",
+         attrs={"data-js-react-on-rails-store": "AppStoreWithReactOnRails"},
+     )
+     if not script_tag or not script_tag.string:
+         return
+
+     data = json.loads(script_tag.string)
+     advent_calendars = data.get("adventCalendars", {})
+     table_calendars = advent_calendars.get("tableAdventCalendars", [])
+     if not table_calendars:
+         return
+     items = table_calendars[0].get("items", [])
+
+     for item in items:
+         # Skip calendar slots whose articles have not been published yet
+         if not item.get("isRevealed", False):
+             continue
+
+         title = item.get("comment")
+         article_url = item.get("url")
+
+         if title and article_url:
+             yield {"title": title, "url": article_url}