pynamubot 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,66 @@
1
+ Metadata-Version: 2.3
2
+ Name: pynamubot
3
+ Version: 0.1.0
4
+ Summary: Python client library for TheSeed-based wiki APIs such as NamuWiki.
5
+ Keywords: namuwiki,theseed,wiki,api,bot
6
+ Author: Iodine at NamuWiki
7
+ License: MIT
8
+ Classifier: Development Status :: 3 - Alpha
9
+ Classifier: Intended Audience :: Developers
10
+ Classifier: License :: OSI Approved :: MIT License
11
+ Classifier: Programming Language :: Python :: 3
12
+ Classifier: Programming Language :: Python :: 3 :: Only
13
+ Classifier: Programming Language :: Python :: 3.9
14
+ Classifier: Programming Language :: Python :: 3.10
15
+ Classifier: Programming Language :: Python :: 3.11
16
+ Classifier: Programming Language :: Python :: 3.12
17
+ Classifier: Topic :: Internet :: WWW/HTTP
18
+ Classifier: Typing :: Typed
19
+ Requires-Dist: lark>=1.3.1
20
+ Requires-Dist: pydantic>=2.12.5
21
+ Requires-Dist: requests>=2.32.5
22
+ Requires-Dist: structlog>=25.5.0
23
+ Requires-Dist: typing-extensions>=4.9.0
24
+ Requires-Dist: selenium>=4.0.0 ; extra == 'puppet'
25
+ Requires-Python: >=3.9
26
+ Provides-Extra: puppet
27
+ Description-Content-Type: text/markdown
28
+
29
+ # pynamubot
30
+
31
+ Python client library for TheSeed-based wiki APIs (for example, NamuWiki).
32
+
33
+ ## Installation
34
+
35
+ ```bash
36
+ pip install pynamubot
37
+ ```
38
+
39
+ If you need Selenium-based utilities:
40
+
41
+ ```bash
42
+ pip install "pynamubot[puppet]"
43
+ ```
44
+
45
+ ## Quick start
46
+
47
+ ```python
48
+ from pynamubot.api import TheSeedAPIClient
49
+
50
+ client = TheSeedAPIClient(
51
+ base_url="https://namu.wiki/api",
52
+ api_token="YOUR_API_TOKEN",
53
+ )
54
+
55
+ response = client.edit_get("TestDocument")
56
+ print(response.exists, response.token)
57
+ ```
58
+
59
+ ## Requirements
60
+
61
+ - Python 3.9+
62
+ - A valid API token issued by the target wiki
63
+
64
+ ## Reference
65
+
66
+ - TheSeed API docs: <https://doc.theseed.io/>
@@ -0,0 +1,38 @@
1
+ # pynamubot
2
+
3
+ Python client library for TheSeed-based wiki APIs (for example, NamuWiki).
4
+
5
+ ## Installation
6
+
7
+ ```bash
8
+ pip install pynamubot
9
+ ```
10
+
11
+ If you need Selenium-based utilities:
12
+
13
+ ```bash
14
+ pip install "pynamubot[puppet]"
15
+ ```
16
+
17
+ ## Quick start
18
+
19
+ ```python
20
+ from pynamubot.api import TheSeedAPIClient
21
+
22
+ client = TheSeedAPIClient(
23
+ base_url="https://namu.wiki/api",
24
+ api_token="YOUR_API_TOKEN",
25
+ )
26
+
27
+ response = client.edit_get("TestDocument")
28
+ print(response.exists, response.token)
29
+ ```
30
+
31
+ ## Requirements
32
+
33
+ - Python 3.9+
34
+ - A valid API token issued by the target wiki
35
+
36
+ ## Reference
37
+
38
+ - TheSeed API docs: <https://doc.theseed.io/>
@@ -0,0 +1,38 @@
1
+ [project]
2
+ name = "pynamubot"
3
+ version = "0.1.0"
4
+ description = "Python client library for TheSeed-based wiki APIs such as NamuWiki."
5
+ readme = "README.md"
6
+ requires-python = ">=3.9"
7
+ license = { text = "MIT" }
8
+ authors = [
9
+ { name = "Iodine at NamuWiki" },
10
+ ]
11
+ keywords = ["namuwiki", "theseed", "wiki", "api", "bot"]
12
+ classifiers = [
13
+ "Development Status :: 3 - Alpha",
14
+ "Intended Audience :: Developers",
15
+ "License :: OSI Approved :: MIT License",
16
+ "Programming Language :: Python :: 3",
17
+ "Programming Language :: Python :: 3 :: Only",
18
+ "Programming Language :: Python :: 3.9",
19
+ "Programming Language :: Python :: 3.10",
20
+ "Programming Language :: Python :: 3.11",
21
+ "Programming Language :: Python :: 3.12",
22
+ "Topic :: Internet :: WWW/HTTP",
23
+ "Typing :: Typed",
24
+ ]
25
+ dependencies = [
26
+ "lark>=1.3.1",
27
+ "pydantic>=2.12.5",
28
+ "requests>=2.32.5",
29
+ "structlog>=25.5.0",
30
+ "typing-extensions>=4.9.0",
31
+ ]
32
+
33
+ [project.optional-dependencies]
34
+ puppet = ["selenium>=4.0.0"]
35
+
36
+ [build-system]
37
+ requires = ["uv_build>=0.9.7,<0.10.0"]
38
+ build-backend = "uv_build"
@@ -0,0 +1,4 @@
1
+ from .__version__ import __version__
2
+ from .api import TheSeedAPIClient
3
+
4
+ __all__ = ["TheSeedAPIClient", "__version__"]
@@ -0,0 +1,6 @@
1
# Package metadata constants.
# NOTE(review): __version__ duplicates the version in pyproject.toml and must
# be kept in sync manually when releasing.
__title__ = "pynamubot"
__description__ = "A Python client for the Namuwiki API."
__version__ = "0.1.0"
__author__ = "Iodine at NamuWiki"
__license__ = "MIT"
__copyright__ = "Copyright Iodine at NamuWiki"
@@ -0,0 +1,4 @@
1
+ from . import schemas
2
+ from .api import TheSeedAPIClient
3
+
4
+ __all__ = ["TheSeedAPIClient", "schemas"]
@@ -0,0 +1,172 @@
1
+ """
2
+ TheSeedAPI client for interacting with the API endpoints.
3
+ """
4
+
5
+ import time
6
+ from functools import cached_property
7
+ from types import TracebackType
8
+ from typing import Any, Optional
9
+
10
+ import requests
11
+ import structlog
12
+ from typing_extensions import Self
13
+
14
+ from ..__version__ import __title__, __version__
15
+ from ..api.schemas import *
16
+
17
+
18
class Limiter:
    """
    A simple rate limiter to enforce a minimum interval between requests.

    Note that NamuWiki's soft rate limit is 1 request per second. Setting interval_seconds to 1.0 is recommended.
    """

    def __init__(self, interval_seconds: float) -> None:
        """
        Initialize the rate limiter.

        :param interval_seconds: The minimum interval between requests in seconds.
            A value of 0 (or less) disables rate limiting: acquire() returns immediately.
        """
        self.interval = float(interval_seconds)
        # time.monotonic() of the last acquire; -inf means "never acquired",
        # so the first acquire never sleeps.
        self.last = float("-inf")

    def acquire(self) -> None:
        """
        Acquire the limiter before making a request. This will block if necessary to enforce the rate limit.
        """
        # Disabled limiter: no-op. (The original shadowed this method with an
        # instance-level lambda in __init__, which breaks subclass overrides
        # and confuses introspection; an explicit guard is equivalent.)
        if self.interval <= 0.0:
            return
        now = time.monotonic()
        elapsed = now - self.last
        wait = self.interval - elapsed
        # sleep(0) is harmless when no wait is needed.
        time.sleep(max(0.0, wait))
        self.last = time.monotonic()

    def __enter__(self) -> "Limiter":
        self.acquire()
        return self

    def __exit__(self, exc_type: Optional[type], exc_value: Optional[BaseException], traceback: Optional[TracebackType]) -> None:
        # Nothing to release: the limiter only delays on entry.
        pass
52
+
53
+
54
class TheSeedAPIClient:
    """
    Client for interacting with TheSeedAPI endpoints.
    """

    def __init__(
        self,
        base_url: str,
        api_token: str,
        logger: Optional[structlog.stdlib.BoundLogger] = None,
        limiter: Optional[Limiter] = None,
    ) -> None:
        """
        Initialize TheSeedAPI with the API token and base URL.

        :param base_url: The base URL for the API endpoints.
        :param api_token: The API token string.
        :param logger: An optional structlog logger. Defaults to None (creates a new logger).
        :param limiter: An optional rate limiter. Defaults to None (no limiter).
        """
        self.base_url = base_url.rstrip("/")
        self.api_token = api_token
        headers = {
            "Authorization": f"Bearer {self.api_token}",
            "Content-Type": "application/json",
            "User-Agent": self.user_agent,
        }
        self.session = requests.Session()
        self.session.headers.update(headers)
        self.logger = logger if logger is not None else structlog.get_logger(__name__)
        # Limiter(0.0) is a no-op, so requests are unthrottled by default.
        self.limiter = limiter if limiter is not None else Limiter(0.0)

    @cached_property
    def user_agent(self) -> str:
        """
        Get the user agent string for the API client.

        :return: The user agent string.
        """
        return f"{requests.utils.default_user_agent()} {__title__}/{__version__}"

    def edit_get(self, document: str) -> EditGETResponse:
        """
        Fetch the content of the document.

        :param document: The document to fetch.
        :return: Parsed response with the document text, existence status, and edit token.
        """
        # NOTE(review): `document` is interpolated into the URL path unescaped;
        # confirm how titles containing '/', '?' or '#' should be encoded.
        url = f"{self.base_url}/edit/{document}"
        with self.limiter, self.session.get(url) as response:
            response.raise_for_status()
            return EditGETResponse.model_validate(response.json())

    def edit_post(self, document: str, body: EditPOSTBody) -> EditPOSTResponse:
        """
        Edit the document with new text.

        :param document: The document to edit.
        :param body: Request body carrying the new text, log message, and edit token.
        :return: Parsed response with the revision number of the edit.
        """
        url = f"{self.base_url}/edit/{document}"
        # BUGFIX: requests' `json=` cannot serialize a pydantic model directly
        # (json.dumps raises TypeError); dump to plain Python types first.
        with self.limiter, self.session.post(url, json=body.model_dump()) as response:
            response.raise_for_status()
            return EditPOSTResponse.model_validate(response.json())

    def backlink(
        self,
        document: str,
        namespace: Optional[str] = None,
        flag: Optional[int] = None,
        fromm: Optional[str] = None,
        until: Optional[str] = None,
    ) -> BacklinkResponse:
        """
        Retrieve backlinks for the document.

        Note: The behavior when both fromm and until are not None is not well-defined.

        :param document: The document to retrieve backlinks for.
        :param namespace: The namespace of documents to query.
        :param flag: Filter on how document is linked to this document.
        :param fromm: Paginate from this document (inclusive). Note the double 'm' to avoid Python keyword conflict.
        :param until: Paginate until this document (inclusive).
        :return: Parsed response with the namespaces, backlinks, and from/to information.
        """
        url = f"{self.base_url}/backlink/{document}"
        # requests omits None-valued params, so unused filters are not sent.
        params: dict[str, Any] = {
            "namespace": namespace,
            "flag": flag,
            "from": fromm,
            "until": until,
        }

        with self.limiter, self.session.get(url, params=params) as response:
            response.raise_for_status()
            return BacklinkResponse.model_validate(response.json())

    def discuss(self, document: str) -> list[DiscussResponse]:
        """
        Fetch discussions on the document.

        :param document: The document to fetch discussions for.
        :return: List of discussion entries (slug, topic, updated date, status).
        """
        url = f"{self.base_url}/discuss/{document}"
        with self.limiter, self.session.get(url) as response:
            response.raise_for_status()
            return [DiscussResponse.model_validate(item) for item in response.json()]

    def __del__(self) -> None:
        # __init__ may have failed before `session` was assigned; never raise
        # from the finalizer in that case.
        session = getattr(self, "session", None)
        if session is not None:
            session.close()

    def __enter__(self) -> Self:
        return self

    def __exit__(self, exc_type: Optional[type[BaseException]], exc_value: Optional[BaseException], traceback: Optional[TracebackType]) -> None:
        self.session.close()
@@ -0,0 +1,57 @@
1
"""
Typed request and response schema definitions for TheSeedAPI.

This module defines Pydantic model classes that describe the
expected structure of JSON request bodies and response payloads
when interacting with TheSeedAPI using the `requests` library.

Each class is meant to serve as a static typing aid — and runtime
validation layer — for improved editor support and readability in
API-related code.

See <https://doc.theseed.io/> for full API documentation.
"""
13
+
14
+ from datetime import datetime
15
+ from typing import Literal, Optional
16
+ from pydantic import BaseModel, Field
17
+
18
+
19
class EditGETResponse(BaseModel):
    """Response of ``GET /edit/{document}``."""

    text: str = Field(description="The content of the document")
    exists: bool = Field(description="Whether the document exists")
    token: str = Field(description="The edit token (will be used for edit POST request)")
23
+
24
+
25
class EditPOSTBody(BaseModel):
    """Request body of ``POST /edit/{document}``."""

    text: str = Field(description="The edited document text")
    log: str = Field(description="Edit summary")
    token: str = Field(description="The edit token (from edit GET request)")
29
+
30
+
31
class EditPOSTResponse(BaseModel):
    """Response of ``POST /edit/{document}``."""

    status: str = Field(description="The status of the edit operation")
    rev: int = Field(description="The edited revision")
34
+
35
+
36
class Namespaces(BaseModel):
    """Per-namespace entry of the backlink response."""

    namespace: str = Field(description="The namespace of the document")
    count: int = Field(description="The number of documents in the namespace")
39
+
40
+
41
class Backlinks(BaseModel):
    """Single backlink entry of the backlink response."""

    document: str = Field(description="The document linked to")
    flags: str = Field(description="Linking flags")
44
+
45
+
46
class BacklinkResponse(BaseModel):
    """Response of ``GET /backlink/{document}``.

    The pagination key on the wire is presumably ``"from"`` (the request side
    sends the parameter under that name), which is a Python keyword, so the
    field is named ``fromm`` and mapped via an alias — confirm against the
    TheSeed API docs.
    """

    # Allow construction by field name (fromm=...) as well as by alias.
    model_config = {"populate_by_name": True}

    namespaces: list[Namespaces] = Field(description="List of namespaces")
    backlinks: list[Backlinks] = Field(description="List of backlinks")
    # BUGFIX: these cursors were declared Optional but still *required*, so
    # payloads without them failed validation; default them to None.
    fromm: Optional[str] = Field(None, alias="from", description="Starting point for the query")
    until: Optional[str] = Field(None, description="Ending point for the query")
51
+
52
+
53
class DiscussResponse(BaseModel):
    """Single discussion entry of ``GET /discuss/{document}``.

    NOTE(review): the ``updated_date`` description says "Unix timestamp" but
    the field type is datetime; pydantic coerces numeric timestamps, but the
    wire format should be confirmed.
    """

    slug: str = Field(description="The discussion slug")
    topic: str = Field(description="The discussion topic")
    updated_date: datetime = Field(description="The last comment's Unix timestamp")
    status: Literal["normal", "close", "pause"] = Field(description="The discussion status")
File without changes
@@ -0,0 +1,35 @@
1
+ from collections import OrderedDict
2
+ from typing import Any, Optional
3
+
4
+ import structlog
5
+
6
+ from ..api import TheSeedAPIClient
7
+
8
+
9
class JobModule:
    """
    Base class for composable bot jobs.

    Sub-modules assigned as attributes (or registered via ``add_module``) are
    tracked in insertion order, and the shared API ``client`` is propagated
    down the module tree.
    """

    # Shared API client; set on the root module and propagated to children.
    client: TheSeedAPIClient

    def __init__(self, name: Optional[str] = None, logger: Optional[structlog.stdlib.BoundLogger] = None) -> None:
        # Registry of child modules, keyed by attribute name, insertion-ordered.
        self._modules: OrderedDict[str, "JobModule"] = OrderedDict()
        self.name = name or self.__class__.__name__
        self.logger = logger or structlog.get_logger(self.name)

    def __setattr__(self, key: str, value: Any) -> None:
        # Intercept attribute assignment so JobModule children are registered
        # automatically and receive this module's client (if already set).
        super().__setattr__(key, value)
        if isinstance(value, JobModule):
            name, module = key, value
            self._modules[name] = module
            if hasattr(self, "client"):
                self._propagate_attribute(module, "client", self.client)

    def _propagate_attribute(self, module: "JobModule", attr_name: str, attr_value: Any) -> None:
        # Recursively set attr_name on module and all of its descendants.
        # An existing value is left untouched (hasattr check), so a child that
        # already has its own client keeps it — but recursion still continues
        # into its sub-modules.
        if not hasattr(module, attr_name):
            setattr(module, attr_name, attr_value)
        for sub_module in module._modules.values():
            self._propagate_attribute(sub_module, attr_name, attr_value)

    def add_module(self, name: str, module: "JobModule") -> None:
        """Add a sub-module to this job module.

        NOTE(review): unlike attribute assignment (``self.x = module``), this
        only registers the module — it does not set it as an attribute on
        self. Confirm this asymmetry is intended.
        """
        self._modules[name] = module
        if hasattr(self, "client"):
            self._propagate_attribute(module, "client", self.client)
@@ -0,0 +1,420 @@
1
+ import re
2
+ import time
3
+ from typing import Any, Callable
4
+
5
+ from .. import specific
6
+ from ..api.api import TheSeedAPIClient
7
+ from ..api.schemas import *
8
+ from ..jobs.job_core import JobModule
9
+ from ..utils import josalib
10
+ from ..utils.misc import safe_div
11
+
12
+
13
def cleanup_backlinks(
    client: TheSeedAPIClient,
    halt_discuss_document: str,
    pairs: list[tuple[str, str, josalib.JosaType]],
    throttle_seconds: float = 1.0,
) -> None:
    """Convenience wrapper: build a BacklinkCleanupJob and run it to completion."""
    BacklinkCleanupJob(
        client=client,
        halt_discuss_document=halt_discuss_document,
        pairs=pairs,
        throttle_seconds=throttle_seconds,
    ).execute()
26
+
27
+
28
def compile_pattern(pairs: list[tuple[str, str, josalib.JosaType]]) -> re.Pattern[str]:
    """Build the verbose regex matching ``[[before]]`` links for every pair.

    Each "before" title gets its own named group (``before_<i>``) so the
    matching pair can be recovered from a match object.
    """
    alternatives = []
    for index, (before, _, _) in enumerate(pairs):
        alternatives.append(rf"(?P<before_{index}>{re.escape(before)})")
    before_patterns = "|".join(alternatives)

    pattern = rf"""
    \[\[                          # [[ (start of link)
    \s*                           # optional whitespace
    (?:{before_patterns})         # one of the before patterns
    \s*                           # optional whitespace
    (?:\|(?P<label>[^[\]]+))?     # optional |label
    \]\]                          # ]] (end of link)
    (?P<josa>[가-힣]+)?            # optional josa
    """

    return re.compile(pattern, re.VERBOSE)
43
+
44
+
45
def fetch_backlink_documents(client: TheSeedAPIClient, pairs: list[tuple[str, str, josalib.JosaType]]) -> set[str]:
    """Collect every document that links to any of the "before" titles.

    Pages through the backlink API until an empty batch is returned.
    """
    collected: set[str] = set()

    for target, _, _ in pairs:
        cursor = None
        while True:
            page = client.backlink(
                document=target,
                namespace=None,
                flag=specific.namuwiki.BacklinkType.ANY,
                fromm=cursor,
            )

            # An empty batch marks the end of pagination for this target.
            if not page.backlinks:
                break

            collected.update(entry.document for entry in page.backlinks)
            cursor = page.until

    return collected
66
+
67
+
68
def catch_halt_signal(client: TheSeedAPIClient, halt_discuss_document: str) -> bool:
    """Check for a halt signal in the discuss document.

    Any discussion thread in "normal" (open) status counts as a halt signal.

    :param client: API client used to query discussions.
    :param halt_discuss_document: Document whose discussions are polled.
    :return: True if an open discussion exists, False otherwise.
    """
    responses = client.discuss(halt_discuss_document)
    # Idiom fix: next(...)/is-not-None/return True-False collapsed to any().
    return any(response.status == "normal" for response in responses)
75
+
76
+
77
class ThrottleJob(JobModule):
    """Callable job that sleeps for a fixed interval, used to pace API calls."""

    def __init__(self, wait_seconds: float = 1.0, *args: Any, **kwargs: Any) -> None:
        super().__init__(*args, **kwargs)
        # Clamp negatives to zero so __call__ never raises from sleep().
        seconds = float(wait_seconds)
        self.wait_seconds = max(0.0, seconds)

    def __call__(self) -> None:
        time.sleep(self.wait_seconds)
84
+
85
+
86
class CatchHaltSignalJob(JobModule):
    """Callable job that polls a discussion document for an operator halt signal.

    Registered pre-hooks (e.g. a throttler) run, in order, before every poll.
    """

    def __init__(self, halt_discuss_document: str, *args: Any, **kwargs: Any) -> None:
        super().__init__(*args, **kwargs)
        self.halt_discuss_document = halt_discuss_document
        # Callables invoked before each halt check, in registration order.
        self._pre_hooks: list[Callable[[], Any]] = []

    def register_pre_hook(self, hook: Callable[[], Any]) -> None:
        """Register *hook* to be called before every halt check."""
        self._pre_hooks.append(hook)

    def __call__(self) -> bool:
        for pre_hook in self._pre_hooks:
            pre_hook()
        return catch_halt_signal(self.client, self.halt_discuss_document)
99
+
100
+
101
class BacklinkCleanupJob(JobModule):
    """Rewrites ``[[links]]`` in every document that links to a renamed target.

    For each (before, after, josa_type) pair, all backlinks of ``before`` are
    fetched and ``[[before]]`` links rewritten to ``[[after]]``, adjusting any
    trailing Korean josa via ``josalib``.
    """

    # Poll the halt-signal discussion once every N processed documents.
    HALT_CHECK_INTERVAL = 100

    def __init__(
        self,
        client: TheSeedAPIClient,
        halt_discuss_document: str,
        pairs: list[tuple[str, str, josalib.JosaType]],
        throttle_seconds: float = 1.0,
        *args: Any,
        **kwargs: Any,
    ):
        super().__init__(*args, **kwargs)
        self.client = client
        self.pairs = pairs
        self.patterns = self._compile_patterns(pairs)
        self.throttler = ThrottleJob(wait_seconds=throttle_seconds)
        self.catcher = CatchHaltSignalJob(halt_discuss_document=halt_discuss_document)
        # The throttler runs as a pre-hook of the catcher, so every halt
        # check is already rate limited.
        self.catcher.register_pre_hook(self.throttler)

        self._reset_history()

    def get_history_summary(self) -> dict[str, Any]:
        """Get a summary of the job execution history with additional metrics."""
        total_processed = self.history["success"] + self.history["failure"] + self.history["skipped"]
        return {
            **self.history,
            "total_processed": total_processed,
            "success_rate": safe_div(self.history["success"], total_processed),
            "avg_occurrences_per_document": safe_div(self.history["occurrences"], total_processed),
        }

    def _compile_patterns(self, pairs: list[tuple[str, str, josalib.JosaType]]) -> re.Pattern[str]:
        """Compile the combined link-rewrite regex for all pairs."""
        self.logger.debug(
            "Compiling regex patterns for backlink cleanup",
            pattern_count=len(pairs),
            patterns=[{"before": before, "after": after, "josa_type": josa_type.value} for before, after, josa_type in pairs],
        )

        # CONSISTENCY: delegate to the module-level compile_pattern() helper;
        # the original duplicated its body here verbatim.
        compiled_pattern = compile_pattern(pairs)

        self.logger.debug(
            "Successfully compiled regex pattern for backlink cleanup",
            pattern=compiled_pattern.pattern,
        )
        return compiled_pattern

    def execute(self, *args: Any, **kwargs: Any) -> Any:
        """Run the cleanup: collect backlinking documents, then edit each one."""
        self.logger.info("Starting backlink cleanup job")
        documents_to_edit = sorted(self._fetch_backlink_documents())

        self.logger.info("Beginning document processing", total_documents=len(documents_to_edit))
        try:
            self._process_documents(documents_to_edit)
        except Exception as e:
            self.logger.error(
                "Backlink cleanup job failed",
                error=str(e),
                error_type=type(e).__name__,
                final_stats=self.get_history_summary(),
            )
            raise
        else:
            self.logger.info(
                "Backlink cleanup job completed",
                final_stats=self.get_history_summary(),
            )

    def _reset_history(self) -> None:
        """Reset history counters for this execution."""
        self.history = {
            "success": 0,
            "failure": 0,
            "skipped": 0,
            "occurrences": 0,
        }

    def _process_documents(self, documents_to_edit: list[str]) -> None:
        """Process all documents in the list."""
        for i, document in enumerate(documents_to_edit):
            if self._should_check_halt_signal(i):
                if self._check_halt_and_maybe_return(i):
                    return

            self._log_document_progress(document, i, len(documents_to_edit))
            self._process_single_document(document)

    def _should_check_halt_signal(self, index: int) -> bool:
        """Check if we should check for halt signal at this iteration."""
        return index % self.HALT_CHECK_INTERVAL == 0

    def _check_halt_and_maybe_return(self, processed_count: int) -> bool:
        """Check for halt signal and return True if should terminate early."""
        # BUGFIX: the catcher already throttles via its registered pre-hook
        # (see __init__); the extra explicit throttle here caused each halt
        # check to wait twice.
        if self.catcher():
            self.logger.info(
                "Halt signal detected - terminating job early",
                processed_documents=processed_count,
                final_stats=self.get_history_summary(),
            )
            return True
        return False

    def _log_document_progress(self, document: str, index: int, total: int) -> None:
        """Log progress for current document."""
        self.logger.debug(
            "Processing document",
            document=document,
            progress=f"{index + 1}/{total}",
            percentage=f"{(index + 1) / total: 6.2%}",
        )

    def _process_single_document(self, document: str) -> None:
        """Process a single document with error handling."""
        try:
            self._edit_document(document)
        except Exception as e:
            self.logger.error(
                "Failed to edit document",
                document=document,
                error=str(e),
                error_type=type(e).__name__,
            )
            self.history["failure"] += 1

    def _fetch_backlink_documents(self) -> set[str]:
        """Collect the set of documents linking to any "before" title."""
        self.logger.debug("Starting backlink document collection")
        documents_to_edit: set[str] = set()

        for before, _, _ in self.pairs:
            self.logger.debug("Fetching backlinks for document", document=before)
            fromm = None

            while True:
                self.logger.debug(
                    "Requesting backlink batch",
                    source_document=before,
                    from_cursor=fromm,
                )

                self.throttler()
                response = self.client.backlink(
                    document=before,
                    namespace=None,
                    flag=specific.namuwiki.BacklinkType.ANY,
                    fromm=fromm,
                )

                if not response.backlinks:
                    self.logger.debug(
                        "Reached end of backlinks",
                        source_document=before,
                        final_cursor=response.fromm,
                    )
                    break
                else:
                    self.logger.debug(
                        "Retrieved backlink batch",
                        source_document=before,
                        batch_size=len(response.backlinks),
                        from_cursor=response.fromm,
                        until_cursor=response.until,
                    )

                documents_to_edit.update(link.document for link in response.backlinks)
                # BUGFIX: without a continuation cursor the next iteration
                # would re-request the first page forever; treat a missing
                # cursor as the final page.
                if response.until is None:
                    break
                fromm = response.until

        self.logger.info(
            "Backlink document collection completed",
            total_unique_documents=len(documents_to_edit),
            source_documents=[before for before, _, _ in self.pairs],
        )

        return documents_to_edit

    def _replace_link(self, m: re.Match[str]) -> str:
        """re.sub callback: rewrite one matched ``[[link]]`` to its new target."""
        for i, (before, after, josa_type) in enumerate(self.pairs):
            if m.group(f"before_{i}") is None:
                continue

            josa_old = m.group("josa")
            label = m.group("label")

            self.logger.debug(
                "Processing link replacement",
                before=before,
                after=after,
                original_josa=josa_old,
                original_label=label,
                josa_type=josa_type.value,
            )

            if josa_old is None:
                josa_new = ""
            else:
                try:
                    fitter = josalib.get_josa_fitter(josa_type)
                    josa_new = fitter(josa_old)
                    self.logger.debug(
                        "Josa transformation successful",
                        original_josa=josa_old,
                        transformed_josa=josa_new,
                    )
                except josalib.AmbiguousJosaError:
                    # Ambiguous josa: keep the original title visible as the
                    # label so the rendered text stays unchanged.
                    self.logger.debug(
                        "Ambiguous josa encountered - preserving original",
                        original_josa=josa_old,
                        fallback_label=before,
                    )
                    label, josa_new = before, josa_old

            if label is None or label == after:
                result = f"[[{after}]]{josa_new}"
            else:
                result = f"[[{after}|{label}]]{josa_new}"

            self.logger.debug(
                "Link replacement completed",
                original_match=m.group(0),
                replacement=result,
            )
            return result

        # BUGFIX: was `assert False`, which is stripped under `python -O`;
        # raise explicitly so a pattern/pairs mismatch is always detected.
        raise RuntimeError("match did not correspond to any configured pair")

    def _edit_document(self, document: str) -> None:
        """Fetch a document, apply the link rewrite, and submit if changed."""
        self.logger.debug("Starting document edit", document=document)
        self.throttler()
        response = self.client.edit_get(document=document)

        if not response.exists:
            self._handle_nonexistent_document(document)
            return

        original_text = response.text
        edited_text = self.patterns.sub(self._replace_link, original_text)

        matches = list(self.patterns.finditer(original_text))
        self.history["occurrences"] += len(matches)

        if original_text == edited_text:
            self._handle_no_changes_needed(document, matches)
            return

        self._submit_document_changes(document, original_text, edited_text, matches, response.token)

    def _handle_nonexistent_document(self, document: str) -> None:
        """Handle case where document doesn't exist."""
        self.logger.debug("Document does not exist - skipping edit", document=document)
        self.history["failure"] += 1

    def _handle_no_changes_needed(self, document: str, matches: list[re.Match[str]]) -> None:
        """Handle case where no changes are needed."""
        self.logger.debug(
            "No changes needed for document",
            document=document,
            pattern_matches=len(matches),
        )
        self.history["skipped"] += 1

    def _submit_document_changes(
        self,
        document: str,
        original_text: str,
        edited_text: str,
        matches: list[re.Match[str]],
        token: str,
    ) -> None:
        """Submit the document changes to the API."""
        self.logger.debug(
            "Submitting document changes",
            document=document,
            pattern_matches=len(matches),
            text_length_change=len(edited_text) - len(original_text),
        )

        try:
            self.throttler()
            response = self.client.edit_post(
                document=document,
                body=EditPOSTBody(
                    text=edited_text,
                    log="🌳🤖",
                    token=token,
                ),
            )
            self._handle_successful_edit(document, response.rev)
        except Exception as e:
            self._handle_failed_edit(document, e)

    def _handle_successful_edit(self, document: str, revision: Any) -> None:
        """Handle successful document edit."""
        self.history["success"] += 1
        self.logger.debug(
            "Document edit completed successfully",
            document=document,
            revision=revision,
        )

    def _handle_failed_edit(self, document: str, error: Exception) -> None:
        """Handle failed document edit."""
        # BUGFIX: this previously re-raised, so _process_single_document
        # logged and counted the same failure a second time (double count).
        self.history["failure"] += 1
        self.logger.error(
            "Failed to submit document edit",
            document=document,
            error=str(error),
            error_type=type(error).__name__,
        )
@@ -0,0 +1,3 @@
1
+ from .parser import get_parser, parse_markup
2
+
3
+ __all__ = ["get_parser", "parse_markup"]
@@ -0,0 +1,31 @@
1
+ from importlib.resources import files
2
+ from functools import lru_cache
3
+
4
+ from lark import Lark, Tree
5
+
6
+
7
def _load_grammar() -> str:
    """Load the bundled ``theseed.lark`` grammar text from package data."""
    resource = files("pynamubot.parse").joinpath("theseed.lark")
    return resource.read_text(encoding="utf-8")
9
+
10
+
11
@lru_cache(maxsize=1)
def get_parser() -> Lark:
    """Build the TheSeed markup parser; cached after the first call.

    :raises RuntimeError: if the grammar cannot be loaded or compiled.
    """
    try:
        # Loading is inside the try so grammar-read failures are wrapped too.
        grammar = _load_grammar()
        return Lark(grammar, parser="lalr")
    except Exception as exc:
        raise RuntimeError("The bundled TheSeed grammar is incomplete and cannot be loaded yet.") from exc
17
+
18
+
19
def parse_markup(text: str) -> Tree:
    """Parse TheSeed markup *text* into a Lark parse tree."""
    tree = get_parser().parse(text)
    return tree
21
+
22
+
23
class _LazyParser:
    """Parser-like proxy that builds the real parser only on first use.

    Keeps module import cheap and safe: get_parser() raises (grammar is
    incomplete) only when the parser is actually used.
    """

    def parse(self, text: str) -> Tree:
        return get_parser().parse(text)

    def __getattr__(self, name: str):
        # Delegate any other attribute access to the real (cached) parser.
        return getattr(get_parser(), name)
29
+
30
+
31
parser = _LazyParser()  # Module-level parser facade; real parser is built lazily.
@@ -0,0 +1,71 @@
1
// Lark grammar for TheSeed wiki markup (work in progress).
// NOTE(review): external_link, file and classification are referenced below
// but never defined, so Lark cannot load this grammar yet — get_parser()
// wraps that failure in a RuntimeError.

start: document

document: redirect_document | namu_mark

// "#redirect <target>" documents consist solely of a marker plus a link.
redirect_document: REDIRECT_MARKER internal_link
internal_link: ANY

namu_mark: (namu_mark_atom | NEWLINE)*
namu_mark_no_newline: namu_mark_atom*
namu_mark_atom: paragraph
              | macro
              | internal_link
              | external_link
              | file
              | classification

// Headings: "open" (default) and "closed" ('#'-suffixed, folded) variants,
// six nesting levels each.
paragraph: paragraph_level1_open
         | paragraph_level2_open
         | paragraph_level3_open
         | paragraph_level4_open
         | paragraph_level5_open
         | paragraph_level6_open
         | paragraph_level1_closed
         | paragraph_level2_closed
         | paragraph_level3_closed
         | paragraph_level4_closed
         | paragraph_level5_closed
         | paragraph_level6_closed

paragraph_level1_open: "= " namu_mark_no_newline " ="
paragraph_level2_open: "== " namu_mark_no_newline " =="
paragraph_level3_open: "=== " namu_mark_no_newline " ==="
paragraph_level4_open: "==== " namu_mark_no_newline " ===="
paragraph_level5_open: "===== " namu_mark_no_newline " ====="
paragraph_level6_open: "====== " namu_mark_no_newline " ======"
paragraph_level1_closed: "=# " namu_mark_no_newline " #="
paragraph_level2_closed: "==# " namu_mark_no_newline " #=="
paragraph_level3_closed: "===# " namu_mark_no_newline " #==="
paragraph_level4_closed: "====# " namu_mark_no_newline " #===="
paragraph_level5_closed: "=====# " namu_mark_no_newline " #====="
paragraph_level6_closed: "======# " namu_mark_no_newline " #======"

// [name] or [name(arguments)]; names are case-insensitive ("i" flag).
macro: "[" macro_name ("(" arguments ")")? "]"
macro_name: "age"i
          | "anchor"i
          | "dday"i
          | "include"i
          | "youtube"i
          | "nicovideo"i
          | "kakaotv"i
          | "pagecount"i
          | "navertv"i
          | "vimeo"i
          | "br"i
          | "clearfix"i
          | "date"i
          | "datetime"i
          | "footnote"i
          | "tableofcontents"i
          | "각주"
          | "목차"

arguments: ANY

NEWLINE: "\n"
REDIRECT_MARKER: "#redirect"i | "#넘겨주기"
// NOTE(review): EXTERNAL_LINK_PROTOCOL is declared but not yet used by any rule.
EXTERNAL_LINK_PROTOCOL: "http"i
                      | "https"i
                      | "ftp"i
// Catch-all terminals; the leading underscore makes _ANY filtered from the tree.
ANY: /.+/
_ANY: /.+/
@@ -0,0 +1,3 @@
1
+ from .puppet import create_driver
2
+
3
+ __all__ = ["create_driver"]
@@ -0,0 +1,39 @@
1
+ from typing import TYPE_CHECKING
2
+
3
+ if TYPE_CHECKING:
4
+ from selenium.webdriver.remote.webdriver import WebDriver as RemoteWebDriver
5
+
6
+
7
def create_driver(browser_name: str, *, headless: bool = True) -> "RemoteWebDriver":
    """Create a Selenium WebDriver for the named browser.

    Selenium is imported lazily, per branch, so merely importing this module
    does not require the optional ``selenium`` dependency.

    Args:
        browser_name: One of ``"chrome"``, ``"edge"``, ``"firefox"`` or
            ``"safari"`` (case-insensitive).
        headless: Run without a visible window where the driver supports it.
            Ignored for Safari, which exposes no headless switch here.

    Returns:
        A ready-to-use Selenium WebDriver instance.

    Raises:
        ValueError: If ``browser_name`` is not one of the supported browsers.
    """
    name = browser_name.lower()

    if name == "chrome":
        from selenium.webdriver import Chrome, ChromeOptions

        opts = ChromeOptions()
        if headless:
            # Chromium-based browsers take the "new" headless mode flag.
            opts.add_argument("--headless=new")
        return Chrome(options=opts)

    if name == "edge":
        from selenium.webdriver import Edge, EdgeOptions

        opts = EdgeOptions()
        if headless:
            opts.add_argument("--headless=new")
        return Edge(options=opts)

    if name == "firefox":
        from selenium.webdriver import Firefox, FirefoxOptions

        opts = FirefoxOptions()
        if headless:
            # Firefox uses a single-dash headless flag.
            opts.add_argument("-headless")
        return Firefox(options=opts)

    if name == "safari":
        from selenium.webdriver import Safari

        return Safari()

    raise ValueError(f"Unsupported browser: {browser_name}")
File without changes
@@ -0,0 +1,3 @@
1
+ from . import alphawiki, namuwiki, theseedwiki
2
+
3
+ __all__ = ["alphawiki", "namuwiki", "theseedwiki"]
@@ -0,0 +1,22 @@
1
+ import enum
2
+
3
+
4
class NamespaceType(enum.Enum):
    """Namespace identifiers (Korean names) used by this wiki."""

    CATEGORY = "분류"
    DOCUMENT = "문서"
    FRAME = "틀"
    FILE = "파일"
    TEMPLATE = "템플릿"
    USER = "사용자"
    META = "알파위키"  # site-specific meta namespace (value is the site name)
    TRASH = "휴지통"
    SYSTEM = "시스템"
    FILE_TRASH = "파일휴지통"


class BacklinkType(enum.IntEnum):
    """Kind of reference a backlink represents.

    Values are powers of two, which suggests they may be OR-combined as a
    bitmask by the API -- TODO confirm against the server's backlink endpoint.
    """

    ANY = 0
    LINK = 1
    FILE = 2
    INCLUDE = 4
    REDIRECT = 8
@@ -0,0 +1,22 @@
1
+ import enum
2
+
3
+
4
class NamespaceType(enum.Enum):
    """Namespace identifiers (Korean names) used by this wiki."""

    CATEGORY = "분류"
    DOCUMENT = "문서"
    FRAME = "틀"
    FILE = "파일"
    TEMPLATE = "템플릿"
    USER = "사용자"
    META = "나무위키"  # site-specific meta namespace (value is the site name)
    TRASH = "휴지통"
    SYSTEM = "시스템"
    FILE_TRASH = "파일휴지통"


class BacklinkType(enum.IntEnum):
    """Kind of reference a backlink represents.

    Values are powers of two, which suggests they may be OR-combined as a
    bitmask by the API -- TODO confirm against the server's backlink endpoint.
    """

    ANY = 0
    LINK = 1
    FILE = 2
    INCLUDE = 4
    REDIRECT = 8
@@ -0,0 +1,22 @@
1
+ import enum
2
+
3
+
4
class NamespaceType(enum.Enum):
    """Namespace identifiers (Korean names) used by this wiki."""

    CATEGORY = "분류"
    DOCUMENT = "문서"
    FRAME = "틀"
    FILE = "파일"
    TEMPLATE = "템플릿"
    USER = "사용자"
    META = "더시드위키"  # site-specific meta namespace (value is the site name)
    TRASH = "휴지통"
    SYSTEM = "시스템"
    FILE_TRASH = "파일휴지통"


class BacklinkType(enum.IntEnum):
    """Kind of reference a backlink represents.

    Values are powers of two, which suggests they may be OR-combined as a
    bitmask by the API -- TODO confirm against the server's backlink endpoint.
    """

    ANY = 0
    LINK = 1
    FILE = 2
    INCLUDE = 4
    REDIRECT = 8
File without changes
@@ -0,0 +1,201 @@
1
+ from enum import Enum
2
+ from typing import Callable, Union
3
+
4
+ import structlog
5
+
6
+ logger = structlog.get_logger(__name__)
7
+
8
+ Fitter = Callable[[str], str]
9
+
10
+
11
class JosaError(Exception): ...  # Base class for all josa-handling errors.


class AmbiguousJosaError(JosaError): ...  # A josa maps to more than one valid form.
15
+
16
+
17
class JosaType(Enum):
    """Classification of a string's final character for josa selection."""

    HAS_JONGSEONG = "HAS_JONGSEONG"  # final Hangul syllable ends in a consonant
    NO_JONGSEONG = "NO_JONGSEONG"  # final Hangul syllable ends in a vowel
    UNKNOWN = "UNKNOWN"  # empty string or final character outside the Hangul range
21
+
22
+
23
# Josa spelled after a jongseong (final consonant) -> spelling after a vowel.
# Scanned in order by to_no_jongseong(): an exact match returns immediately,
# and among prefix matches the LAST table entry wins.
# Fix: the original table listed ("아", "야") twice (at the top and again at
# the end); the redundant copy is removed -- behavior is unchanged because an
# exact match hits the first entry and duplicate prefix candidates are equal.
_HAS_TO_NO = [
    ("과", "와"),
    ("아", "야"),
    ("으로", "로"),
    ("으론", "론"),
    ("은", "는"),
    ("을", "를"),
    ("이", "가"),
    ("이다", "이다"),  # invariant: same spelling in both forms
    ("이며", "이며"),  # invariant: same spelling in both forms
    ("이면", "면"),  # drops the leading '이'
    ("이었", "였"),
    ("이고", "고"),
    ("이나", "나"),
    ("이든", "든"),
    ("이라", "라"),
    ("이란", "란"),
    ("이랑", "랑"),
    ("이셔", "셔"),
    ("이셨", "셨"),
    ("이시여", "시여"),
    ("이여", "여"),
    ("이래", "래"),
    ("이랬", "랬"),
    ("이렷", "렷"),
    ("이로", "로"),
    ("이야", "야"),
]
52
+
53
# Josa spelled after a vowel -> spelling after a jongseong (final consonant).
# Scanned in order by to_has_jongseong(): an exact match returns immediately,
# and among prefix matches the LAST table entry wins.  The exact input "야" is
# deliberately listed twice (-> "아" and -> "이야") and is rejected up front
# by to_has_jongseong() as ambiguous.
_NO_TO_HAS = [
    ("와", "과"),
    ("로", "으로"),
    ("론", "으론"),
    ("는", "은"),
    ("를", "을"),
    ("가", "이"),
    ("다", "이다"),  # Add '이' for this case
    ("며", "이며"),  # Add '이' for this case
    ("면", "이면"),  # Add '이' for this case
    ("였", "이었"),
    ("고", "이고"),
    ("나", "이나"),
    ("든", "이든"),
    ("라", "이라"),
    ("란", "이란"),
    ("랑", "이랑"),
    ("셔", "이셔"),
    ("셨", "이셨"),
    ("시", "이시"),
    ("여", "이여"),
    ("래", "이래"),
    ("랬", "이랬"),
    ("렷", "이렷"),
    ("야", "아"),  # Ambiguous: could be '아' or '이야'
    ("야", "이야"),  # Ambiguous: could be '아' or '이야'
    ("로서", "으로서"),
    ("로써", "으로써"),
]
82
+
83
+
84
def get_josa_type(string: str) -> JosaType:
    """Classify *string* by whether its final Hangul syllable has a jongseong.

    Returns UNKNOWN for an empty string or when the final character is not a
    precomposed Hangul syllable ("가".."힣").
    """
    logger.debug("Determining josa type", string=string)

    if not string:
        logger.debug("Empty string, returning UNKNOWN")
        return JosaType.UNKNOWN

    final_char = string[-1]
    codepoint = ord(final_char)
    logger.debug("Last character analysis", char=final_char, unicode_code=codepoint)

    if not "가" <= final_char <= "힣":
        logger.debug("Character not in Hangul range, returning UNKNOWN")
        return JosaType.UNKNOWN

    # Precomposed Hangul syllables are arranged so that
    # (codepoint - ord("가")) % 28 is the jongseong index; 0 means none.
    jongseong_index = (codepoint - ord("가")) % 28

    if jongseong_index == 0:
        logger.debug("No jongseong detected", char=final_char, result="NO_JONGSEONG")
        return JosaType.NO_JONGSEONG

    logger.debug(
        "Jongseong detected",
        char=final_char,
        jongseong_index=jongseong_index,
        result="HAS_JONGSEONG",
    )
    return JosaType.HAS_JONGSEONG
111
+
112
+
113
def to_has_jongseong(josa: str) -> str:
    """Convert a josa from its no-jongseong form to its has-jongseong form.

    For example "는" -> "은" and "를" -> "을" (per the ``_NO_TO_HAS`` table).
    A prefix match rewrites only the matching head and keeps the tail, so
    compound josa are handled as well.  If nothing matches, the input is
    returned unchanged.

    Raises:
        AmbiguousJosaError: For the exact input "야", which could map to
            either "아" or "이야".
    """
    logger.debug("Converting josa to has_jongseong form", input_josa=josa)
    old_to_new = _NO_TO_HAS

    # "야" is listed twice in the table with different targets; refuse to guess.
    if josa == "야":
        logger.info("Encountered ambiguous josa '야'", josa=josa)
        raise AmbiguousJosaError()

    new_josa_candidates: list[str] = []
    for old_josa, new_josa in old_to_new:
        if josa == old_josa:
            logger.debug("Exact match found", old_josa=old_josa, new_josa=new_josa)
            return new_josa
        if josa.startswith(old_josa):
            # Rewrite the matching head, keep the remainder unchanged.
            candidate = new_josa + josa[len(old_josa) :]
            new_josa_candidates.append(candidate)
            logger.debug(
                "Partial match found",
                old_josa=old_josa,
                new_josa=new_josa,
                candidate=candidate,
            )

    if not new_josa_candidates:
        logger.debug("No matches found, returning original", result=josa)
        return josa
    else:
        # When several prefixes matched, the later table entry wins.
        result = new_josa_candidates[-1]
        logger.debug(
            "Multiple candidates found, returning last",
            candidates=new_josa_candidates,
            result=result,
        )
        return result
147
+
148
+
149
def to_no_jongseong(josa: str) -> str:
    """Convert a josa from its has-jongseong form to its no-jongseong form.

    For example "은" -> "는" and "을" -> "를" (per the ``_HAS_TO_NO`` table).
    A prefix match rewrites only the matching head and keeps the tail.  If no
    table entry matches, a leading "이" is stripped as a fallback; otherwise
    the input is returned unchanged.
    """
    logger.debug("Converting josa to no_jongseong form", input_josa=josa)
    old_to_new = _HAS_TO_NO

    new_josa_candidates: list[str] = []
    for old_josa, new_josa in old_to_new:
        if josa == old_josa:
            logger.debug("Exact match found", old_josa=old_josa, new_josa=new_josa)
            return new_josa
        if josa.startswith(old_josa):
            # Rewrite the matching head, keep the remainder unchanged.
            candidate = new_josa + josa[len(old_josa) :]
            new_josa_candidates.append(candidate)
            logger.debug(
                "Partial match found",
                old_josa=old_josa,
                new_josa=new_josa,
                candidate=candidate,
            )

    if not new_josa_candidates:
        # Fallback: the copula prefix "이" is dropped after a vowel.
        if josa.startswith("이"):
            result = josa[1:]
            logger.debug("Fallback: removed leading '이'", original=josa, result=result)
            return result
        else:
            logger.debug("No matches found, returning original", result=josa)
            return josa
    else:
        # When several prefixes matched, the later table entry wins.
        result = new_josa_candidates[-1]
        logger.debug(
            "Multiple candidates found, returning last",
            candidates=new_josa_candidates,
            result=result,
        )
        return result
184
+
185
+
186
def get_josa_fitter(string: Union[str, JosaType]) -> Fitter:
    """Return the converter that adapts a josa to *string*'s jongseong class.

    *string* may be a word (classified via ``get_josa_type``) or an explicit
    ``JosaType``.  A word ending in a jongseong gets ``to_no_jongseong``;
    a word ending in a vowel gets ``to_has_jongseong``.

    Raises:
        ValueError: If the josa type cannot be determined (UNKNOWN).
    """
    josa_type = string if isinstance(string, JosaType) else get_josa_type(string)
    logger.debug("Getting josa fitter", input=string, josa_type=josa_type)

    if josa_type is JosaType.HAS_JONGSEONG:
        logger.debug("Returning to_no_jongseong fitter")
        return to_no_jongseong
    if josa_type is JosaType.NO_JONGSEONG:
        logger.debug("Returning to_has_jongseong fitter")
        return to_has_jongseong

    # UNKNOWN (or any future value) cannot be fitted.
    logger.info("Unknown josa type encountered", input=string)
    raise ValueError("Unknown josa type")
@@ -0,0 +1,16 @@
1
def safe_div(numerator: float, denominator: float, safe_value: float = 0.0) -> float:
    """Divide two numbers, substituting a fallback on division by zero.

    Args:
        numerator: The dividend.
        denominator: The divisor.
        safe_value: Value returned when ``denominator`` is zero.
            Defaults to 0.0.

    Returns:
        ``numerator / denominator``, or ``safe_value`` when the division
        raises ``ZeroDivisionError``.
    """
    # EAFP: attempt the division and fall back only on the zero-divisor error.
    try:
        quotient = numerator / denominator
    except ZeroDivisionError:
        return safe_value
    return quotient