recent-state-summarizer 0.0.8.tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- recent_state_summarizer-0.0.8/LICENSE +21 -0
- recent_state_summarizer-0.0.8/PKG-INFO +112 -0
- recent_state_summarizer-0.0.8/README.md +76 -0
- recent_state_summarizer-0.0.8/pyproject.toml +48 -0
- recent_state_summarizer-0.0.8/recent_state_summarizer/__init__.py +1 -0
- recent_state_summarizer-0.0.8/recent_state_summarizer/__main__.py +93 -0
- recent_state_summarizer-0.0.8/recent_state_summarizer/fetch/__init__.py +153 -0
- recent_state_summarizer-0.0.8/recent_state_summarizer/fetch/adventar.py +59 -0
- recent_state_summarizer-0.0.8/recent_state_summarizer/fetch/hatena_blog.py +39 -0
- recent_state_summarizer-0.0.8/recent_state_summarizer/fetch/hatena_bookmark.py +34 -0
- recent_state_summarizer-0.0.8/recent_state_summarizer/fetch/qiita_advent_calendar.py +58 -0
- recent_state_summarizer-0.0.8/recent_state_summarizer/summarize.py +76 -0
- recent_state_summarizer-0.0.8/recent_state_summarizer.egg-info/PKG-INFO +112 -0
- recent_state_summarizer-0.0.8/recent_state_summarizer.egg-info/SOURCES.txt +25 -0
- recent_state_summarizer-0.0.8/recent_state_summarizer.egg-info/dependency_links.txt +1 -0
- recent_state_summarizer-0.0.8/recent_state_summarizer.egg-info/entry_points.txt +2 -0
- recent_state_summarizer-0.0.8/recent_state_summarizer.egg-info/requires.txt +19 -0
- recent_state_summarizer-0.0.8/recent_state_summarizer.egg-info/top_level.txt +3 -0
- recent_state_summarizer-0.0.8/setup.cfg +4 -0
- recent_state_summarizer-0.0.8/setup.py +3 -0
- recent_state_summarizer-0.0.8/tests/fetch/__init__.py +0 -0
- recent_state_summarizer-0.0.8/tests/fetch/test_adventar.py +130 -0
- recent_state_summarizer-0.0.8/tests/fetch/test_core.py +57 -0
- recent_state_summarizer-0.0.8/tests/fetch/test_hatena_blog.py +195 -0
- recent_state_summarizer-0.0.8/tests/fetch/test_hatena_bookmark.py +61 -0
- recent_state_summarizer-0.0.8/tests/fetch/test_qiita_advent_calendar.py +118 -0
- recent_state_summarizer-0.0.8/tests/test_main.py +163 -0
recent_state_summarizer-0.0.8/LICENSE
@@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) 2023 nikkie
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
recent_state_summarizer-0.0.8/PKG-INFO
@@ -0,0 +1,112 @@
+Metadata-Version: 2.4
+Name: recent-state-summarizer
+Version: 0.0.8
+Summary: Summarize a list of entry titles using LLM
+Author-email: nikkie <takuyafjp+develop@gmail.com>
+License: MIT
+Classifier: Development Status :: 1 - Planning
+Classifier: License :: OSI Approved :: MIT License
+Classifier: Programming Language :: Python
+Classifier: Programming Language :: Python :: 3
+Classifier: Programming Language :: Python :: 3.10
+Classifier: Programming Language :: Python :: 3.11
+Classifier: Programming Language :: Python :: 3.12
+Classifier: Programming Language :: Python :: 3.13
+Classifier: Programming Language :: Python :: 3.14
+Requires-Python: >=3.10
+Description-Content-Type: text/markdown
+License-File: LICENSE
+Requires-Dist: beautifulsoup4
+Requires-Dist: feedparser
+Requires-Dist: httpx
+Requires-Dist: openai<1
+Provides-Extra: testing
+Requires-Dist: pytest; extra == "testing"
+Requires-Dist: responses; extra == "testing"
+Requires-Dist: respx; extra == "testing"
+Provides-Extra: lint
+Requires-Dist: flake8; extra == "lint"
+Requires-Dist: black; extra == "lint"
+Requires-Dist: isort; extra == "lint"
+Provides-Extra: dev
+Requires-Dist: wheel; extra == "dev"
+Requires-Dist: build; extra == "dev"
+Requires-Dist: twine; extra == "dev"
+Dynamic: license-file
+
+# recent-state-summarizer
+
+Summarize blog article titles with the OpenAI API
+
+a.k.a. _RSS_ 😃
+
+## Setup
+
+```
+$ pip install recent-state-summarizer
+```
+
+⚠️ Set the `OPENAI_API_KEY` environment variable.
+ref: https://platform.openai.com/account/api-keys
+
+## Usage
+
+```
+$ omae-douyo https://nikkie-ftnext.hatenablog.com/archive/2023/4
+
+この人物は最近、プログラミングに関することを中心にして活動しています。
+
+(snip)
+
+最近は、株式会社はてなに入社したようです。
+```
+
+Currently supported:
+
+- はてなブログ (Hatena Blog)
+- はてなブックマークRSS (Hatena Bookmark RSS)
+- Adventar
+- Qiita Advent Calendar
+
+For help, run `omae-douyo -h`.
+
+### Fetch only (save to file)
+
+Fetch the titles and URLs of articles and save them to a file, without summarization:
+
+```
+# Save in JSON Lines format (default)
+$ omae-douyo fetch https://nikkie-ftnext.hatenablog.com/archive/2023/4 articles.jsonl
+
+# Save as a bullet list
+$ omae-douyo fetch https://nikkie-ftnext.hatenablog.com/archive/2023/4 titles.txt --as-title-list
+```
+
+## Development
+
+### Sub commands
+
+Fetch only (same as `omae-douyo fetch`):
+
+```
+python -m recent_state_summarizer.fetch -h
+```
+
+Summarize only:
+Skipping the fetch step is convenient when tuning the prompt.
+
+```
+python -m recent_state_summarizer.summarize -h
+```
+
+### Environment
+
+```
+$ git clone https://github.com/ftnext/recent-state-summarizer.git
+$ cd recent-state-summarizer
+
+$ python -m venv venv
+$ source venv/bin/activate
+(venv) $ pip install -r requirements.lock
+(venv) $ pip install -e '.'
+```
recent_state_summarizer-0.0.8/README.md
@@ -0,0 +1,76 @@
+# recent-state-summarizer
+
+Summarize blog article titles with the OpenAI API
+
+a.k.a. _RSS_ 😃
+
+## Setup
+
+```
+$ pip install recent-state-summarizer
+```
+
+⚠️ Set the `OPENAI_API_KEY` environment variable.
+ref: https://platform.openai.com/account/api-keys
+
+## Usage
+
+```
+$ omae-douyo https://nikkie-ftnext.hatenablog.com/archive/2023/4
+
+この人物は最近、プログラミングに関することを中心にして活動しています。
+
+(snip)
+
+最近は、株式会社はてなに入社したようです。
+```
+
+Currently supported:
+
+- はてなブログ (Hatena Blog)
+- はてなブックマークRSS (Hatena Bookmark RSS)
+- Adventar
+- Qiita Advent Calendar
+
+For help, run `omae-douyo -h`.
+
+### Fetch only (save to file)
+
+Fetch the titles and URLs of articles and save them to a file, without summarization:
+
+```
+# Save in JSON Lines format (default)
+$ omae-douyo fetch https://nikkie-ftnext.hatenablog.com/archive/2023/4 articles.jsonl
+
+# Save as a bullet list
+$ omae-douyo fetch https://nikkie-ftnext.hatenablog.com/archive/2023/4 titles.txt --as-title-list
+```
+
+## Development
+
+### Sub commands
+
+Fetch only (same as `omae-douyo fetch`):
+
+```
+python -m recent_state_summarizer.fetch -h
+```
+
+Summarize only:
+Skipping the fetch step is convenient when tuning the prompt.
+
+```
+python -m recent_state_summarizer.summarize -h
+```
+
+### Environment
+
+```
+$ git clone https://github.com/ftnext/recent-state-summarizer.git
+$ cd recent-state-summarizer
+
+$ python -m venv venv
+$ source venv/bin/activate
+(venv) $ pip install -r requirements.lock
+(venv) $ pip install -e '.'
+```
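(In the Usage sample, the Japanese output reads roughly: "Recently, this person has been active mainly around programming. (snip) Recently, they appear to have joined Hatena Co., Ltd.")

Judging from `_as_json` in `recent_state_summarizer/fetch/__init__.py` (later in this diff), each line of the saved `articles.jsonl` should be a single JSON object with `title` and `url` keys; the values below are hypothetical:

```
{"title": "Example article title", "url": "https://example.hatenablog.com/entry/2023/04/01/000000"}
{"title": "Another article title", "url": "https://example.hatenablog.com/entry/2023/04/02/000000"}
```

With `--as-title-list`, `_as_bullet_list` instead writes one `- <title>` bullet per article.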
recent_state_summarizer-0.0.8/pyproject.toml
@@ -0,0 +1,48 @@
+[build-system]
+requires = ["setuptools"]
+build-backend = "setuptools.build_meta"
+
+[project]
+name = "recent-state-summarizer"
+authors = [
+    {name = "nikkie", email = "takuyafjp+develop@gmail.com"},
+]
+description = "Summarize a list of entry titles using LLM"
+readme = {file = "README.md", content-type = "text/markdown"}
+requires-python = ">=3.10"
+license = {text = "MIT"}
+classifiers = [
+    "Development Status :: 1 - Planning",
+    "License :: OSI Approved :: MIT License",
+    "Programming Language :: Python",
+    "Programming Language :: Python :: 3",
+    "Programming Language :: Python :: 3.10",
+    "Programming Language :: Python :: 3.11",
+    "Programming Language :: Python :: 3.12",
+    "Programming Language :: Python :: 3.13",
+    "Programming Language :: Python :: 3.14",
+]
+dependencies = [
+    "beautifulsoup4",
+    "feedparser",
+    "httpx",
+    "openai<1",
+]
+dynamic = ["version"]
+
+[project.optional-dependencies]
+testing = ["pytest", "responses", "respx"]
+lint = ["flake8", "black", "isort"]
+dev = ["wheel", "build", "twine"]
+
+[project.scripts]
+omae-douyo = "recent_state_summarizer.__main__:main"
+
+[tool.setuptools.packages.find]
+exclude = ["tests"]
+
+[tool.setuptools.dynamic]
+version = {attr = "recent_state_summarizer.__version__"}
+
+[tool.pytest.ini_options]
+testpaths = ["tests"]
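The `[tool.setuptools.dynamic]` table above tells setuptools to read the package version from a module attribute at build time. Conceptually it resolves to the following (a minimal sketch, not setuptools' actual code):

```
# What version = {attr = "recent_state_summarizer.__version__"} resolves to:
import recent_state_summarizer

print(recent_state_summarizer.__version__)  # -> "0.0.8", per the __init__.py hunk below
```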
recent_state_summarizer-0.0.8/recent_state_summarizer/__init__.py
@@ -0,0 +1 @@
+__version__ = "0.0.8"
recent_state_summarizer-0.0.8/recent_state_summarizer/__main__.py
@@ -0,0 +1,93 @@
+import argparse
+import sys
+import tempfile
+from textwrap import dedent
+
+from recent_state_summarizer.fetch import _main as fetch_main
+from recent_state_summarizer.fetch import build_parser as build_fetch_parser
+from recent_state_summarizer.summarize import summarize_titles
+
+
+def build_parser():
+    help_message = """
+    Summarize blog article titles with the OpenAI API.
+
+    ⚠️ Set `OPENAI_API_KEY` environment variable.
+
+    Example:
+      omae-douyo https://awesome.hatenablog.com/archive/2023
+
+    Retrieve the titles of articles from a specified URL.
+    After summarization, prints the summary.
+    """
+    parser = argparse.ArgumentParser(
+        formatter_class=argparse.RawDescriptionHelpFormatter,
+        description=dedent(help_message),
+    )
+    subparsers = parser.add_subparsers(dest="subcommand")
+
+    run_parser = subparsers.add_parser(
+        "run", help="Fetch article titles and generate summary (default)"
+    )
+    run_parser.add_argument("url", help="URL of archive page")
+    run_parser.set_defaults(func=run_cli)
+
+    fetch_help_message = """
+    Retrieve the titles and URLs of articles from a web page specified by URL
+    and save them as JSON Lines format.
+
+    Support:
+    - はてなブログ(Hatena blog)
+    - はてなブックマークRSS
+    - Adventar
+    - Qiita Advent Calendar
+
+    Example:
+      omae-douyo fetch https://awesome.hatenablog.com/archive/2023 articles.jsonl
+    """
+    fetch_parser = subparsers.add_parser(
+        "fetch",
+        parents=[build_fetch_parser(add_help=False)],
+        help="Fetch article titles only and save to file",
+        formatter_class=argparse.RawDescriptionHelpFormatter,
+        description=dedent(fetch_help_message),
+    )
+    fetch_parser.set_defaults(func=fetch_cli)
+
+    return parser
+
+
+def run_cli(args):
+    with tempfile.NamedTemporaryFile(mode="w+") as tempf:
+        fetch_main(args.url, tempf.name, save_as_title_list=True)
+        tempf.seek(0)
+        titles = tempf.read()
+        summary = summarize_titles(titles)
+        print(summary)
+
+
+def fetch_cli(args):
+    fetch_main(args.url, args.save_path, save_as_title_list=args.as_title_list)
+
+
+def normalize_argv() -> list[str]:
+    argv = sys.argv[1:]
+    if len(argv) == 0:
+        return ["--help"]
+
+    help_flags = {"-h", "--help"}
+    if argv[0] in help_flags:
+        return argv
+
+    known_subcommands = {"run", "fetch"}
+    if argv[0] not in known_subcommands:
+        return ["run"] + argv
+
+    return argv
+
+
+def main():
+    parser = build_parser()
+    argv = normalize_argv()
+    args = parser.parse_args(argv)
+    args.func(args)
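`normalize_argv` is what makes `run` the implicit default subcommand: a bare URL is rewritten to `run <url>` before parsing, while help flags and known subcommands pass through unchanged. A quick sketch, assuming the package is installed (the URL is hypothetical; `normalize_argv` reads `sys.argv[1:]`, so the demo patches it):

```
import sys

from recent_state_summarizer.__main__ import normalize_argv

# Simulate `omae-douyo <url>` with a hypothetical URL.
sys.argv = ["omae-douyo", "https://example.hatenablog.com/archive/2023"]
print(normalize_argv())  # -> ['run', 'https://example.hatenablog.com/archive/2023']

# A known subcommand is left as-is.
sys.argv = ["omae-douyo", "fetch", "https://example.hatenablog.com/archive/2023", "out.jsonl"]
print(normalize_argv())  # -> ['fetch', 'https://example.hatenablog.com/archive/2023', 'out.jsonl']
```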
recent_state_summarizer-0.0.8/recent_state_summarizer/fetch/__init__.py
@@ -0,0 +1,153 @@
+from __future__ import annotations
+
+import argparse
+import json
+import logging
+import textwrap
+from collections.abc import Iterable
+from enum import Enum
+from pathlib import Path
+from urllib.parse import urlparse
+
+from recent_state_summarizer.fetch.adventar import (
+    TitleTag,
+    fetch_adventar_calendar,
+)
+from recent_state_summarizer.fetch.hatena_blog import _fetch_titles
+from recent_state_summarizer.fetch.hatena_bookmark import (
+    fetch_hatena_bookmark_rss,
+)
+from recent_state_summarizer.fetch.qiita_advent_calendar import (
+    fetch_qiita_advent_calendar,
+)
+
+logger = logging.getLogger(__name__)
+
+
+class URLType(Enum):
+    """Type of URL for fetching."""
+
+    HATENA_BLOG = "hatena_blog"
+    HATENA_BOOKMARK_RSS = "hatena_bookmark_rss"
+    ADVENTAR = "adventar"
+    QIITA_ADVENT_CALENDAR = "qiita_advent_calendar"
+    UNKNOWN = "unknown"
+
+
+def _detect_url_type(url: str) -> URLType:
+    """Detect the type of URL to determine fetch strategy.
+
+    Args:
+        url: URL to analyze
+
+    Returns:
+        URLType indicating the fetch strategy to use
+    """
+    parsed = urlparse(url)
+    if (
+        parsed.netloc == "b.hatena.ne.jp"
+        and parsed.path.startswith("/entrylist/")
+        and parsed.path.endswith(".rss")
+    ):
+        return URLType.HATENA_BOOKMARK_RSS
+
+    if parsed.netloc == "qiita.com" and "/advent-calendar/" in parsed.path:
+        return URLType.QIITA_ADVENT_CALENDAR
+
+    if "/calendars/" in parsed.path or "adventar.org" in parsed.netloc:
+        return URLType.ADVENTAR
+
+    if (
+        "hatenablog.com" in url
+        or "hateblo.jp" in url
+        or "/archive/" in parsed.path
+    ):
+        return URLType.HATENA_BLOG
+
+    return URLType.UNKNOWN
+
+
+def _select_fetcher(url_type):
+    match url_type:
+        case URLType.HATENA_BOOKMARK_RSS:
+            return fetch_hatena_bookmark_rss
+        case URLType.HATENA_BLOG:
+            return _fetch_titles
+        case URLType.ADVENTAR:
+            return fetch_adventar_calendar
+        case URLType.QIITA_ADVENT_CALENDAR:
+            return fetch_qiita_advent_calendar
+        case _:
+            raise ValueError(f"Unsupported URL type: {url_type}")
+
+
+def _main(
+    url: str, save_path: str | Path, *, save_as_title_list: bool
+) -> None:
+    url_type = _detect_url_type(url)
+    fetcher = _select_fetcher(url_type)
+    title_tags = fetcher(url)
+    if save_as_title_list:
+        contents = _as_bullet_list(
+            title_tag["title"] for title_tag in title_tags
+        )
+    else:
+        contents = _as_json(title_tags)
+    _save(save_path, contents)
+
+
+def _as_bullet_list(titles: Iterable[str]) -> str:
+    return "\n".join(f"- {title}" for title in titles)
+
+
+def _as_json(title_tags: Iterable[TitleTag]) -> str:
+    return "\n".join(
+        json.dumps(title_tag, ensure_ascii=False) for title_tag in title_tags
+    )
+
+
+def _save(path: str | Path, contents: str) -> None:
+    with open(path, "w", encoding="utf8", newline="") as f:
+        f.write(contents)
+
+
+def build_parser(add_help: bool = True) -> argparse.ArgumentParser:
+    help_message = """
+    Retrieve the titles and URLs of articles from a web page specified by URL
+    and save them as JSON Lines format.
+
+    Support:
+    - はてなブログ(Hatena blog)
+    - はてなブックマークRSS
+    - Adventar
+    - Qiita Advent Calendar
+
+    Example:
+      python -m recent_state_summarizer.fetch \\
+        https://awesome.hatenablog.com/archive/2023 articles.jsonl
+    """
+    parser = argparse.ArgumentParser(
+        formatter_class=argparse.RawDescriptionHelpFormatter,
+        description=textwrap.dedent(help_message),
+        add_help=add_help,
+    )
+    parser.add_argument("url", help="URL of archive page")
+    parser.add_argument("save_path", help="Local file path")
+    parser.add_argument(
+        "--as-title-list",
+        action="store_true",
+        default=False,
+        help="Save as title-only bullet list instead of JSON Lines",
+    )
+    return parser
+
+
+def cli():
+    parser = build_parser()
+    args = parser.parse_args()
+
+    _main(args.url, args.save_path, save_as_title_list=args.as_title_list)
+
+
+if __name__ == "__main__":
+    cli()
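`_detect_url_type` keys the dispatch on the parsed netloc and path, and `_select_fetcher` maps each `URLType` to its fetcher. A quick check of the branches, using hypothetical URLs (all imported names are from this module; assumes the package is installed):

```
from recent_state_summarizer.fetch import URLType, _detect_url_type

assert _detect_url_type("https://b.hatena.ne.jp/entrylist/it.rss") is URLType.HATENA_BOOKMARK_RSS
assert _detect_url_type("https://qiita.com/advent-calendar/2025/python") is URLType.QIITA_ADVENT_CALENDAR
assert _detect_url_type("https://adventar.org/calendars/11474") is URLType.ADVENTAR
assert _detect_url_type("https://example.hatenablog.com/archive/2023") is URLType.HATENA_BLOG
assert _detect_url_type("https://example.com/") is URLType.UNKNOWN
```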
recent_state_summarizer-0.0.8/recent_state_summarizer/fetch/adventar.py
@@ -0,0 +1,59 @@
+from collections.abc import Generator
+from typing import TypedDict
+
+import httpx
+from bs4 import BeautifulSoup
+
+
+class TitleTag(TypedDict):
+    title: str
+    url: str
+
+
+def _fetch(url: str) -> str:
+    response = httpx.get(url)
+    response.raise_for_status()
+    return response.text
+
+
+def fetch_adventar_calendar(url: str) -> Generator[TitleTag, None, None]:
+    """Fetch article titles and URLs from Adventar calendar.
+
+    Args:
+        url: Adventar calendar URL (e.g., https://adventar.org/calendars/11474)
+
+    Yields:
+        TitleTag dictionaries containing title and url
+    """
+    raw_html = _fetch(url)
+    yield from _parse_titles(raw_html)
+
+
+def _parse_titles(raw_html: str) -> Generator[TitleTag, None, None]:
+    """Parse titles from Adventar calendar HTML."""
+    soup = BeautifulSoup(raw_html, "html.parser")
+    entry_list = soup.find("ul", class_="EntryList")
+    if not entry_list:
+        return
+
+    items = entry_list.find_all("li", class_="item")
+    for item in items:
+        article = item.find("div", class_="article")
+        if not article:
+            continue
+
+        link_div = article.find("div", class_="link")
+        if not link_div:
+            continue
+
+        link = link_div.find("a")
+        if not link or "href" not in link.attrs:
+            continue
+
+        title_div = link_div.find_next_sibling("div")
+        if title_div and title_div.text.strip():
+            title = title_div.text.strip()
+        else:
+            title = link.text.strip()
+
+        yield {"title": title, "url": link["href"]}
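`_parse_titles` walks a nested `ul.EntryList > li.item > div.article > div.link > a` structure, preferring the `div` that follows the link block as the title and falling back to the link text. A minimal sketch of markup it would accept (the HTML below is hypothetical, reduced to just the classes the parser looks for):

```
from recent_state_summarizer.fetch.adventar import _parse_titles

html = """
<ul class="EntryList">
  <li class="item">
    <div class="article">
      <div class="link"><a href="https://example.com/entry1">link text</a></div>
      <div class="title">Example entry title</div>
    </div>
  </li>
</ul>
"""
print(list(_parse_titles(html)))
# -> [{'title': 'Example entry title', 'url': 'https://example.com/entry1'}]
```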
recent_state_summarizer-0.0.8/recent_state_summarizer/fetch/hatena_blog.py
@@ -0,0 +1,39 @@
+from collections.abc import Generator
+from typing import TypedDict
+
+import httpx
+from bs4 import BeautifulSoup
+
+PARSE_HATENABLOG_KWARGS = {"name": "a", "attrs": {"class": "entry-title-link"}}
+
+
+class TitleTag(TypedDict):
+    title: str
+    url: str
+
+
+def _fetch(url: str) -> str:
+    with httpx.Client() as client:
+        response = client.get(url)
+        response.raise_for_status()
+        return response.text
+
+
+def _fetch_titles(url: str) -> Generator[TitleTag, None, None]:
+    raw_html = _fetch(url)
+    yield from _parse_titles(raw_html)
+
+    soup = BeautifulSoup(raw_html, "html.parser")
+    next_link = soup.find("a", class_="test-pager-next")
+    if next_link and "href" in next_link.attrs:
+        next_url = next_link["href"]
+        print(f"Next page found, fetching... {next_url}")
+        yield from _fetch_titles(next_url)
+
+
+def _parse_titles(raw_html: str) -> Generator[TitleTag, None, None]:
+    soup = BeautifulSoup(raw_html, "html.parser")
+    body = soup.body
+    title_tags = body.find_all(**PARSE_HATENABLOG_KWARGS)
+    for title_tag in title_tags:
+        yield {"title": title_tag.text, "url": title_tag["href"]}
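Hatena Blog archive pages mark each entry title with an `a.entry-title-link` tag (the `PARSE_HATENABLOG_KWARGS` constant above), and `_fetch_titles` recurses through `a.test-pager-next` links to follow pagination. The parsing step alone can be exercised without the network (hypothetical markup):

```
from recent_state_summarizer.fetch.hatena_blog import _parse_titles

html = """
<html><body>
<a class="entry-title-link" href="https://example.hatenablog.com/entry/2023/04/01">First post</a>
<a class="entry-title-link" href="https://example.hatenablog.com/entry/2023/04/02">Second post</a>
</body></html>
"""
for title_tag in _parse_titles(html):
    print(title_tag)
# -> {'title': 'First post', 'url': 'https://example.hatenablog.com/entry/2023/04/01'}
# -> {'title': 'Second post', 'url': 'https://example.hatenablog.com/entry/2023/04/02'}
```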
recent_state_summarizer-0.0.8/recent_state_summarizer/fetch/hatena_bookmark.py
@@ -0,0 +1,34 @@
+from typing import Generator, TypedDict
+
+import feedparser
+import httpx
+
+
+class BookmarkEntry(TypedDict):
+    title: str
+    url: str
+    description: str
+
+
+def fetch_hatena_bookmark_rss(
+    url: str,
+) -> Generator[BookmarkEntry, None, None]:
+    """Fetch entries from Hatena Bookmark RSS feed.
+
+    Args:
+        url: URL of the Hatena Bookmark RSS feed
+
+    Yields:
+        Bookmark entries with title, url, and description
+    """
+    response = httpx.get(url)
+    response.raise_for_status()
+
+    feed = feedparser.parse(response.content)
+
+    for entry in feed.entries:
+        yield {
+            "title": entry.title,
+            "url": entry.link,
+            "description": entry.description,
+        }
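`fetch_hatena_bookmark_rss` lets `feedparser` do the heavy lifting, so usage is a plain loop. Note that `_detect_url_type` only routes here for `b.hatena.ne.jp/entrylist/....rss` URLs; the URL below is hypothetical and the call needs network access:

```
from recent_state_summarizer.fetch.hatena_bookmark import fetch_hatena_bookmark_rss

url = "https://b.hatena.ne.jp/entrylist/it.rss"  # hypothetical entrylist RSS URL
for entry in fetch_hatena_bookmark_rss(url):
    print(entry["title"], entry["url"])
```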
recent_state_summarizer-0.0.8/recent_state_summarizer/fetch/qiita_advent_calendar.py
@@ -0,0 +1,58 @@
+import json
+from collections.abc import Generator
+from typing import TypedDict
+
+import httpx
+from bs4 import BeautifulSoup
+
+
+class TitleTag(TypedDict):
+    title: str
+    url: str
+
+
+def _fetch(url: str) -> str:
+    response = httpx.get(url)
+    response.raise_for_status()
+    return response.text
+
+
+def fetch_qiita_advent_calendar(url: str) -> Generator[TitleTag, None, None]:
+    """Fetch article titles and URLs from Qiita Advent Calendar.
+
+    Args:
+        url: Qiita Advent Calendar URL (e.g., https://qiita.com/advent-calendar/2025/python-type-hints)
+
+    Yields:
+        TitleTag dictionaries containing title and url
+    """
+    raw_html = _fetch(url)
+    yield from _parse_titles(raw_html)
+
+
+def _parse_titles(raw_html: str) -> Generator[TitleTag, None, None]:
+    """Parse titles from Qiita Advent Calendar HTML by extracting JSON data."""
+    soup = BeautifulSoup(raw_html, "html.parser")
+    script_tag = soup.find(
+        "script",
+        attrs={"data-js-react-on-rails-store": "AppStoreWithReactOnRails"},
+    )
+    if not script_tag or not script_tag.string:
+        return
+
+    data = json.loads(script_tag.string)
+    advent_calendars = data.get("adventCalendars", {})
+    table_calendars = advent_calendars.get("tableAdventCalendars", [])
+    if not table_calendars:
+        return
+    items = table_calendars[0].get("items", [])
+
+    for item in items:
+        if not item.get("isRevealed", False):
+            continue
+
+        title = item.get("comment")
+        article_url = item.get("url")
+
+        if title and article_url:
+            yield {"title": title, "url": article_url}
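Qiita's calendar pages embed their data as JSON in a `script` tag rather than in plain markup, so `_parse_titles` is effectively a JSON extractor. A self-contained sketch with a hypothetical, minimal store payload:

```
import json

from recent_state_summarizer.fetch.qiita_advent_calendar import _parse_titles

store = {
    "adventCalendars": {
        "tableAdventCalendars": [
            {
                "items": [
                    {"isRevealed": True, "comment": "Day 1 article", "url": "https://example.com/day1"},
                    {"isRevealed": False, "comment": "Day 2", "url": None},
                ]
            }
        ]
    }
}
html = (
    '<script data-js-react-on-rails-store="AppStoreWithReactOnRails">'
    + json.dumps(store)
    + "</script>"
)
print(list(_parse_titles(html)))
# -> [{'title': 'Day 1 article', 'url': 'https://example.com/day1'}]  (unrevealed items are skipped)
```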