recent-state-summarizer 0.0.1.tar.gz → 0.0.3.tar.gz

This diff shows the content of publicly available package versions as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two versions as they appear in the public registry.

Potentially problematic release: this version of recent-state-summarizer might be problematic.

Files changed (18)
  1. {recent-state-summarizer-0.0.1/recent_state_summarizer.egg-info → recent_state_summarizer-0.0.3}/PKG-INFO +17 -5
  2. {recent-state-summarizer-0.0.1 → recent_state_summarizer-0.0.3}/pyproject.toml +5 -4
  3. recent_state_summarizer-0.0.3/recent_state_summarizer/__init__.py +1 -0
  4. {recent-state-summarizer-0.0.1 → recent_state_summarizer-0.0.3}/recent_state_summarizer/fetch.py +37 -9
  5. {recent-state-summarizer-0.0.1 → recent_state_summarizer-0.0.3}/recent_state_summarizer/summarize.py +5 -3
  6. {recent-state-summarizer-0.0.1 → recent_state_summarizer-0.0.3/recent_state_summarizer.egg-info}/PKG-INFO +17 -5
  7. {recent-state-summarizer-0.0.1 → recent_state_summarizer-0.0.3}/recent_state_summarizer.egg-info/SOURCES.txt +2 -1
  8. {recent-state-summarizer-0.0.1 → recent_state_summarizer-0.0.3}/recent_state_summarizer.egg-info/requires.txt +2 -1
  9. {recent-state-summarizer-0.0.1 → recent_state_summarizer-0.0.3}/recent_state_summarizer.egg-info/top_level.txt +0 -1
  10. recent_state_summarizer-0.0.3/tests/test_fetch.py +150 -0
  11. recent-state-summarizer-0.0.1/recent_state_summarizer/__init__.py +0 -1
  12. {recent-state-summarizer-0.0.1 → recent_state_summarizer-0.0.3}/LICENSE +0 -0
  13. {recent-state-summarizer-0.0.1 → recent_state_summarizer-0.0.3}/README.md +0 -0
  14. {recent-state-summarizer-0.0.1 → recent_state_summarizer-0.0.3}/recent_state_summarizer/__main__.py +0 -0
  15. {recent-state-summarizer-0.0.1 → recent_state_summarizer-0.0.3}/recent_state_summarizer.egg-info/dependency_links.txt +0 -0
  16. {recent-state-summarizer-0.0.1 → recent_state_summarizer-0.0.3}/recent_state_summarizer.egg-info/entry_points.txt +0 -0
  17. {recent-state-summarizer-0.0.1 → recent_state_summarizer-0.0.3}/setup.cfg +0 -0
  18. {recent-state-summarizer-0.0.1 → recent_state_summarizer-0.0.3}/setup.py +0 -0
{recent-state-summarizer-0.0.1/recent_state_summarizer.egg-info → recent_state_summarizer-0.0.3}/PKG-INFO
@@ -1,6 +1,6 @@
- Metadata-Version: 2.1
+ Metadata-Version: 2.4
  Name: recent-state-summarizer
- Version: 0.0.1
+ Version: 0.0.3
  Summary: Summarize a list of entry titles using LLM
  Author-email: nikkie <takuyafjp+develop@gmail.com>
  License: MIT
@@ -8,16 +8,28 @@ Classifier: Development Status :: 1 - Planning
  Classifier: License :: OSI Approved :: MIT License
  Classifier: Programming Language :: Python
  Classifier: Programming Language :: Python :: 3
- Classifier: Programming Language :: Python :: 3.8
  Classifier: Programming Language :: Python :: 3.9
  Classifier: Programming Language :: Python :: 3.10
  Classifier: Programming Language :: Python :: 3.11
- Requires-Python: >=3.8
+ Classifier: Programming Language :: Python :: 3.12
+ Classifier: Programming Language :: Python :: 3.13
+ Requires-Python: >=3.9
  Description-Content-Type: text/markdown
+ License-File: LICENSE
+ Requires-Dist: beautifulsoup4
+ Requires-Dist: openai<1
  Provides-Extra: testing
+ Requires-Dist: pytest; extra == "testing"
+ Requires-Dist: pytest_httpserver; extra == "testing"
  Provides-Extra: lint
+ Requires-Dist: flake8; extra == "lint"
+ Requires-Dist: black; extra == "lint"
+ Requires-Dist: isort; extra == "lint"
  Provides-Extra: dev
- License-File: LICENSE
+ Requires-Dist: wheel; extra == "dev"
+ Requires-Dist: build; extra == "dev"
+ Requires-Dist: twine; extra == "dev"
+ Dynamic: license-file

  # recent-state-summarizer

{recent-state-summarizer-0.0.1 → recent_state_summarizer-0.0.3}/pyproject.toml
@@ -9,26 +9,27 @@ authors = [
  ]
  description = "Summarize a list of entry titles using LLM"
  readme = {file = "README.md", content-type = "text/markdown"}
- requires-python = ">=3.8"
+ requires-python = ">=3.9"
  license = {text = "MIT"}
  classifiers = [
      "Development Status :: 1 - Planning",
      "License :: OSI Approved :: MIT License",
      "Programming Language :: Python",
      "Programming Language :: Python :: 3",
-     "Programming Language :: Python :: 3.8",
      "Programming Language :: Python :: 3.9",
      "Programming Language :: Python :: 3.10",
      "Programming Language :: Python :: 3.11",
+     "Programming Language :: Python :: 3.12",
+     "Programming Language :: Python :: 3.13",
  ]
  dependencies = [
      "beautifulsoup4",
-     "openai",
+     "openai<1",
  ]
  dynamic = ["version"]

  [project.optional-dependencies]
- testing = ["pytest"]
+ testing = ["pytest", "pytest_httpserver"]
  lint = ["flake8", "black", "isort"]
  dev = ["wheel", "build", "twine"]

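Note on the openai<1 pin: summarize.py (below) calls the pre-1.0 module-level openai.ChatCompletion API, which the openai 1.x SDK removed in favor of a client object, so the upper bound keeps the published code running against a compatible SDK. A minimal sketch of the call style the pin preserves (the API key and model name are placeholders, not values from this package):

import openai  # requires openai<1, as pinned above

openai.api_key = "sk-..."  # placeholder; typically read from the environment
response = openai.ChatCompletion.create(
    model="gpt-3.5-turbo",  # placeholder model name
    messages=[{"role": "user", "content": "Say hello"}],
    temperature=0.0,
)
# openai<1 responses support dict-style access
print(response["choices"][0]["message"]["content"])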
recent_state_summarizer-0.0.3/recent_state_summarizer/__init__.py
@@ -0,0 +1 @@
+ __version__ = "0.0.3"
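With dynamic = ["version"] in pyproject.toml, this new package-level __version__ is presumably the attribute setuptools reads the version from (the setup.py/setup.cfg wiring is unchanged and not shown in this diff, so that is an assumption). Once installed, the same string is available through the standard metadata API; a small sketch:

# Querying the installed version via the stdlib:
from importlib.metadata import version

print(version("recent-state-summarizer"))  # expected: "0.0.3"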
{recent-state-summarizer-0.0.1 → recent_state_summarizer-0.0.3}/recent_state_summarizer/fetch.py
@@ -1,7 +1,9 @@
  from __future__ import annotations

+ import json
  from collections.abc import Generator, Iterable
  from pathlib import Path
+ from typing import TypedDict
  from urllib.request import urlopen

  from bs4 import BeautifulSoup
@@ -9,37 +11,57 @@ from bs4 import BeautifulSoup
  PARSE_HATENABLOG_KWARGS = {"name": "a", "attrs": {"class": "entry-title-link"}}


- def _main(url: str, save_path: str | Path) -> None:
-     contents = fetch_titles_as_bullet_list(url)
-     _save(save_path, contents)
+ class TitleTag(TypedDict):
+     title: str
+     url: str


- def fetch_titles_as_bullet_list(url: str) -> str:
-     return _as_bullet_list(_fetch_titles(url))
+ def _main(url: str, save_path: str | Path, save_as_json: bool) -> None:
+     title_tags = _fetch_titles(url)
+     if not save_as_json:
+         contents = _as_bullet_list(
+             title_tag["title"] for title_tag in title_tags
+         )
+     else:
+         contents = _as_json(title_tags)
+     _save(save_path, contents)


- def _fetch_titles(url: str) -> Generator[str, None, None]:
+ def _fetch_titles(url: str) -> Generator[TitleTag, None, None]:
      raw_html = _fetch(url)
      yield from _parse_titles(raw_html)

+     soup = BeautifulSoup(raw_html, "html.parser")
+     next_link = soup.find("a", class_="test-pager-next")
+     if next_link and "href" in next_link.attrs:
+         next_url = next_link["href"]
+         print(f"Next page found, fetching... {next_url}")
+         yield from _fetch_titles(next_url)
+

  def _fetch(url: str) -> str:
      with urlopen(url) as res:
          return res.read()


- def _parse_titles(raw_html: str) -> Generator[str, None, None]:
+ def _parse_titles(raw_html: str) -> Generator[TitleTag, None, None]:
      soup = BeautifulSoup(raw_html, "html.parser")
      body = soup.body
      title_tags = body.find_all(**PARSE_HATENABLOG_KWARGS)
      for title_tag in title_tags:
-         yield title_tag.text
+         yield {"title": title_tag.text, "url": title_tag["href"]}


  def _as_bullet_list(titles: Iterable[str]) -> str:
      return "\n".join(f"- {title}" for title in titles)


+ def _as_json(title_tags: Iterable[TitleTag]) -> str:
+     return "\n".join(
+         json.dumps(title_tag, ensure_ascii=False) for title_tag in title_tags
+     )
+
+
  def _save(path: str | Path, contents: str) -> None:
      with open(path, "w", encoding="utf8", newline="") as f:
          f.write(contents)
@@ -66,6 +88,12 @@ if __name__ == "__main__":
      )
      parser.add_argument("url", help="URL of archive page")
      parser.add_argument("save_path", help="Local file path")
+     parser.add_argument(
+         "--as-json",
+         action="store_true",
+         default=False,
+         help="Save as JSON format instead of bullet list",
+     )
      args = parser.parse_args()

-     _main(args.url, args.save_path)
+     _main(args.url, args.save_path, args.as_json)
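The pagination change rewrites _fetch_titles as a recursive generator: it yields the current page's entries, then, if a Hatena Blog "next page" pager link (class test-pager-next) is present, follows it with yield from, so callers see one flat stream of TitleTag dicts regardless of page count. A self-contained sketch of that pattern, with a made-up in-memory "site" standing in for the HTTP fetch:

from collections.abc import Generator

# Hypothetical two-page archive: url -> (titles on that page, next url or None)
PAGES = {
    "/archive?page=1": (["Title 3", "Title 2"], "/archive?page=2"),
    "/archive?page=2": (["Title 1"], None),
}


def fetch_all_titles(url: str) -> Generator[str, None, None]:
    titles, next_url = PAGES[url]
    yield from titles            # entries on the current page, in order
    if next_url is not None:     # pager link found: recurse into the next page
        yield from fetch_all_titles(next_url)


assert list(fetch_all_titles("/archive?page=1")) == ["Title 3", "Title 2", "Title 1"]

The output side changes in step: _parse_titles now yields title/URL pairs, and the new --as-json flag writes them as one JSON object per line via _as_json instead of a bullet list.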
{recent-state-summarizer-0.0.1 → recent_state_summarizer-0.0.3}/recent_state_summarizer/summarize.py
@@ -25,17 +25,19 @@ def _build_prompts(titles: str):

  def _build_summarize_prompt_text(titles_as_list: str) -> str:
      return f"""\
- 以下は同一人物が最近書いたブログ記事のタイトルの一覧です。
+ 3つのバッククォートで囲まれた以下は、同一人物が最近書いたブログ記事のタイトルの一覧です。
  それを読み、この人物が最近何をやっているかを詳しく教えてください。
  応答は文ごとに改行して区切ってください。

+ ```
  {titles_as_list}
+ ```
  """


- def _complete_chat(prompts):
+ def _complete_chat(prompts, temperature=0.0):
      return openai.ChatCompletion.create(
-         model=MODEL, messages=prompts, temperature=0.8
+         model=MODEL, messages=prompts, temperature=temperature
      )


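For non-Japanese readers, the revised prompt says roughly: "The following, enclosed in three backquotes, is a list of titles of blog posts recently written by the same person. Read it and tell me in detail what this person has been doing recently. Separate your response with a line break after each sentence." Two deliberate changes here: the titles are now fenced so the model can separate instructions from input, and the default temperature drops from 0.8 to 0.0 for more reproducible summaries. A sketch of what the new template renders to, using placeholder titles:

titles_as_list = "- Title 2\n- Title 1"  # placeholder input
prompt = f"""\
3つのバッククォートで囲まれた以下は、同一人物が最近書いたブログ記事のタイトルの一覧です。
それを読み、この人物が最近何をやっているかを詳しく教えてください。
応答は文ごとに改行して区切ってください。

```
{titles_as_list}
```
"""
print(prompt)  # the title list arrives inside a clearly delimited fence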
{recent-state-summarizer-0.0.1 → recent_state_summarizer-0.0.3/recent_state_summarizer.egg-info}/PKG-INFO
@@ -1,6 +1,6 @@
- Metadata-Version: 2.1
+ Metadata-Version: 2.4
  Name: recent-state-summarizer
- Version: 0.0.1
+ Version: 0.0.3
  Summary: Summarize a list of entry titles using LLM
  Author-email: nikkie <takuyafjp+develop@gmail.com>
  License: MIT
@@ -8,16 +8,28 @@ Classifier: Development Status :: 1 - Planning
  Classifier: License :: OSI Approved :: MIT License
  Classifier: Programming Language :: Python
  Classifier: Programming Language :: Python :: 3
- Classifier: Programming Language :: Python :: 3.8
  Classifier: Programming Language :: Python :: 3.9
  Classifier: Programming Language :: Python :: 3.10
  Classifier: Programming Language :: Python :: 3.11
- Requires-Python: >=3.8
+ Classifier: Programming Language :: Python :: 3.12
+ Classifier: Programming Language :: Python :: 3.13
+ Requires-Python: >=3.9
  Description-Content-Type: text/markdown
+ License-File: LICENSE
+ Requires-Dist: beautifulsoup4
+ Requires-Dist: openai<1
  Provides-Extra: testing
+ Requires-Dist: pytest; extra == "testing"
+ Requires-Dist: pytest_httpserver; extra == "testing"
  Provides-Extra: lint
+ Requires-Dist: flake8; extra == "lint"
+ Requires-Dist: black; extra == "lint"
+ Requires-Dist: isort; extra == "lint"
  Provides-Extra: dev
- License-File: LICENSE
+ Requires-Dist: wheel; extra == "dev"
+ Requires-Dist: build; extra == "dev"
+ Requires-Dist: twine; extra == "dev"
+ Dynamic: license-file

  # recent-state-summarizer

{recent-state-summarizer-0.0.1 → recent_state_summarizer-0.0.3}/recent_state_summarizer.egg-info/SOURCES.txt
@@ -11,4 +11,5 @@ recent_state_summarizer.egg-info/SOURCES.txt
  recent_state_summarizer.egg-info/dependency_links.txt
  recent_state_summarizer.egg-info/entry_points.txt
  recent_state_summarizer.egg-info/requires.txt
- recent_state_summarizer.egg-info/top_level.txt
+ recent_state_summarizer.egg-info/top_level.txt
+ tests/test_fetch.py
{recent-state-summarizer-0.0.1 → recent_state_summarizer-0.0.3}/recent_state_summarizer.egg-info/requires.txt
@@ -1,5 +1,5 @@
  beautifulsoup4
- openai
+ openai<1

  [dev]
  wheel
@@ -13,3 +13,4 @@ isort

  [testing]
  pytest
+ pytest_httpserver
recent_state_summarizer-0.0.3/tests/test_fetch.py
@@ -0,0 +1,150 @@
+ import pytest
+
+ from recent_state_summarizer.fetch import _main
+
+
+ @pytest.fixture
+ def blog_server(httpserver):
+     httpserver.expect_request("/archive/2025/06").respond_with_data(
+         f"""\
+ <!DOCTYPE html>
+ <html>
+ <head><title>Archive</title></head>
+ <body>
+ <h1>Archive</h1>
+ <div id="content">
+   <div id="content-inner">
+     <div id="wrapper">
+       <div id="main">
+         <div id="main-inner">
+           <div class="archive-entries">
+             <section class="archive-entry">
+               <a class="entry-title-link" href="{httpserver.url_for('/')}archive/2025/06/03">Title 3</a>
+             </section>
+             <section class="archive-entry">
+               <a class="entry-title-link" href="{httpserver.url_for('/')}archive/2025/06/02">Title 2</a>
+             </section>
+             <section class="archive-entry">
+               <a class="entry-title-link" href="{httpserver.url_for('/')}archive/2025/06/01">Title 1</a>
+             </section>
+           </div>
+         </div>
+       </div>
+     </div>
+   </div>
+ </div>
+ </body>
+ </html>"""
+     )
+     return httpserver
+
+
+ def test_fetch_as_bullet_list(blog_server, tmp_path):
+     _main(
+         blog_server.url_for("/archive/2025/06"),
+         tmp_path / "titles.txt",
+         save_as_json=False,
+     )
+
+     expected = """\
+ - Title 3
+ - Title 2
+ - Title 1"""
+     assert (tmp_path / "titles.txt").read_text(encoding="utf8") == expected
+
+
+ def test_fetch_as_json(blog_server, tmp_path):
+     _main(
+         blog_server.url_for("/archive/2025/06"),
+         tmp_path / "titles.json",
+         save_as_json=True,
+     )
+
+     expected = f"""\
+ {{"title": "Title 3", "url": "{blog_server.url_for('/archive/2025/06/03')}"}}
+ {{"title": "Title 2", "url": "{blog_server.url_for('/archive/2025/06/02')}"}}
+ {{"title": "Title 1", "url": "{blog_server.url_for('/archive/2025/06/01')}"}}"""
+     assert (tmp_path / "titles.json").read_text(encoding="utf8") == expected
+
+
+ @pytest.fixture
+ def multi_page_blog_server(httpserver):
+     httpserver.expect_request(
+         "/archive/2025/07", query_string="page=2"
+     ).respond_with_data(
+         f"""\
+ <!DOCTYPE html>
+ <html>
+ <head><title>Archive (Page 2)</title></head>
+ <body>
+ <h1>Archive</h1>
+ <div id="content">
+   <div id="content-inner">
+     <div id="wrapper">
+       <div id="main">
+         <div id="main-inner">
+           <div class="archive-entries">
+             <section class="archive-entry">
+               <a class="entry-title-link" href="{httpserver.url_for('/')}archive/2025/07/01">Title 1</a>
+             </section>
+           </div>
+           <div class="pager">
+             <span class="pager-prev">
+               <a href="{httpserver.url_for('/')}archive/2025/07" class="test-pager-prev" rel="prev">前のページ</a>
+             </span>
+           </div>
+         </div>
+       </div>
+     </div>
+   </div>
+ </div>
+ </body>
+ </html>"""
+     )
+     httpserver.expect_request("/archive/2025/07").respond_with_data(
+         f"""\
+ <!DOCTYPE html>
+ <html>
+ <head><title>Archive</title></head>
+ <body>
+ <h1>Archive</h1>
+ <div id="content">
+   <div id="content-inner">
+     <div id="wrapper">
+       <div id="main">
+         <div id="main-inner">
+           <div class="archive-entries">
+             <section class="archive-entry">
+               <a class="entry-title-link" href="{httpserver.url_for('/')}archive/2025/07/03">Title 3</a>
+             </section>
+             <section class="archive-entry">
+               <a class="entry-title-link" href="{httpserver.url_for('/')}archive/2025/07/02">Title 2</a>
+             </section>
+           </div>
+         </div>
+         <div class="pager">
+           <span class="pager-next">
+             <a href="{httpserver.url_for('/')}archive/2025/07?page=2" class="test-pager-next" rel="next">次のページ</a>
+           </span>
+         </div>
+       </div>
+     </div>
+   </div>
+ </div>
+ </body>
+ </html>"""
+     )
+     return httpserver
+
+
+ def test_fetch_multiple_archive_page(multi_page_blog_server, tmp_path):
+     _main(
+         multi_page_blog_server.url_for("/archive/2025/07"),
+         tmp_path / "titles.txt",
+         save_as_json=False,
+     )
+
+     expected = """- Title 3
+ - Title 2
+ - Title 1"""
+     assert (tmp_path / "titles.txt").read_text(encoding="utf8") == expected
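These tests lean on pytest_httpserver (added to the testing extra above): its httpserver fixture runs a real local HTTP server, expect_request(...).respond_with_data(...) registers canned responses, and url_for() returns the corresponding localhost URL, so _main is exercised through genuine urlopen calls without touching the network. A minimal sketch of the pattern in isolation, with a hypothetical path and body:

from urllib.request import urlopen


def test_served_body_roundtrip(httpserver):
    # Register a canned response, then fetch it over real local HTTP.
    httpserver.expect_request("/hello").respond_with_data("hi")
    with urlopen(httpserver.url_for("/hello")) as res:
        assert res.read() == b"hi"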
recent-state-summarizer-0.0.1/recent_state_summarizer/__init__.py
@@ -1 +0,0 @@
- __version__ = "0.0.1"