pynamubot 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pynamubot-0.1.0/PKG-INFO +66 -0
- pynamubot-0.1.0/README.md +38 -0
- pynamubot-0.1.0/pyproject.toml +38 -0
- pynamubot-0.1.0/src/pynamubot/__init__.py +4 -0
- pynamubot-0.1.0/src/pynamubot/__version__.py +6 -0
- pynamubot-0.1.0/src/pynamubot/api/__init__.py +4 -0
- pynamubot-0.1.0/src/pynamubot/api/api.py +172 -0
- pynamubot-0.1.0/src/pynamubot/api/schemas.py +57 -0
- pynamubot-0.1.0/src/pynamubot/jobs/__init__.py +0 -0
- pynamubot-0.1.0/src/pynamubot/jobs/job_core.py +35 -0
- pynamubot-0.1.0/src/pynamubot/jobs/jobs.py +420 -0
- pynamubot-0.1.0/src/pynamubot/parse/__init__.py +3 -0
- pynamubot-0.1.0/src/pynamubot/parse/parser.py +31 -0
- pynamubot-0.1.0/src/pynamubot/parse/theseed.lark +71 -0
- pynamubot-0.1.0/src/pynamubot/puppet/__init__.py +3 -0
- pynamubot-0.1.0/src/pynamubot/puppet/puppet.py +39 -0
- pynamubot-0.1.0/src/pynamubot/py.typed +0 -0
- pynamubot-0.1.0/src/pynamubot/specific/__init__.py +3 -0
- pynamubot-0.1.0/src/pynamubot/specific/alphawiki.py +22 -0
- pynamubot-0.1.0/src/pynamubot/specific/namuwiki.py +22 -0
- pynamubot-0.1.0/src/pynamubot/specific/theseedwiki.py +22 -0
- pynamubot-0.1.0/src/pynamubot/utils/__init__.py +0 -0
- pynamubot-0.1.0/src/pynamubot/utils/josalib.py +201 -0
- pynamubot-0.1.0/src/pynamubot/utils/misc.py +16 -0
pynamubot-0.1.0/PKG-INFO
ADDED
|
@@ -0,0 +1,66 @@
|
|
|
1
|
+
Metadata-Version: 2.3
|
|
2
|
+
Name: pynamubot
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Python client library for TheSeed-based wiki APIs such as NamuWiki.
|
|
5
|
+
Keywords: namuwiki,theseed,wiki,api,bot
|
|
6
|
+
Author: Iodine at NamuWiki
|
|
7
|
+
License: MIT
|
|
8
|
+
Classifier: Development Status :: 3 - Alpha
|
|
9
|
+
Classifier: Intended Audience :: Developers
|
|
10
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
11
|
+
Classifier: Programming Language :: Python :: 3
|
|
12
|
+
Classifier: Programming Language :: Python :: 3 :: Only
|
|
13
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
14
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
17
|
+
Classifier: Topic :: Internet :: WWW/HTTP
|
|
18
|
+
Classifier: Typing :: Typed
|
|
19
|
+
Requires-Dist: lark>=1.3.1
|
|
20
|
+
Requires-Dist: pydantic>=2.12.5
|
|
21
|
+
Requires-Dist: requests>=2.32.5
|
|
22
|
+
Requires-Dist: structlog>=25.5.0
|
|
23
|
+
Requires-Dist: typing-extensions>=4.9.0
|
|
24
|
+
Requires-Dist: selenium>=4.0.0 ; extra == 'puppet'
|
|
25
|
+
Requires-Python: >=3.9
|
|
26
|
+
Provides-Extra: puppet
|
|
27
|
+
Description-Content-Type: text/markdown
|
|
28
|
+
|
|
29
|
+
# pynamubot
|
|
30
|
+
|
|
31
|
+
Python client library for TheSeed-based wiki APIs (for example, NamuWiki).
|
|
32
|
+
|
|
33
|
+
## Installation
|
|
34
|
+
|
|
35
|
+
```bash
|
|
36
|
+
pip install pynamubot
|
|
37
|
+
```
|
|
38
|
+
|
|
39
|
+
If you need Selenium-based utilities:
|
|
40
|
+
|
|
41
|
+
```bash
|
|
42
|
+
pip install "pynamubot[puppet]"
|
|
43
|
+
```
|
|
44
|
+
|
|
45
|
+
## Quick start
|
|
46
|
+
|
|
47
|
+
```python
|
|
48
|
+
from pynamubot.api import TheSeedAPIClient
|
|
49
|
+
|
|
50
|
+
client = TheSeedAPIClient(
|
|
51
|
+
base_url="https://namu.wiki/api",
|
|
52
|
+
api_token="YOUR_API_TOKEN",
|
|
53
|
+
)
|
|
54
|
+
|
|
55
|
+
response = client.edit_get("TestDocument")
|
|
56
|
+
print(response.exists, response.token)
|
|
57
|
+
```
|
|
58
|
+
|
|
59
|
+
## Requirements
|
|
60
|
+
|
|
61
|
+
- Python 3.9+
|
|
62
|
+
- A valid API token issued by the target wiki
|
|
63
|
+
|
|
64
|
+
## Reference
|
|
65
|
+
|
|
66
|
+
- TheSeed API docs: <https://doc.theseed.io/>
|
|
@@ -0,0 +1,38 @@
|
|
|
1
|
+
# pynamubot
|
|
2
|
+
|
|
3
|
+
Python client library for TheSeed-based wiki APIs (for example, NamuWiki).
|
|
4
|
+
|
|
5
|
+
## Installation
|
|
6
|
+
|
|
7
|
+
```bash
|
|
8
|
+
pip install pynamubot
|
|
9
|
+
```
|
|
10
|
+
|
|
11
|
+
If you need Selenium-based utilities:
|
|
12
|
+
|
|
13
|
+
```bash
|
|
14
|
+
pip install "pynamubot[puppet]"
|
|
15
|
+
```
|
|
16
|
+
|
|
17
|
+
## Quick start
|
|
18
|
+
|
|
19
|
+
```python
|
|
20
|
+
from pynamubot.api import TheSeedAPIClient
|
|
21
|
+
|
|
22
|
+
client = TheSeedAPIClient(
|
|
23
|
+
base_url="https://namu.wiki/api",
|
|
24
|
+
api_token="YOUR_API_TOKEN",
|
|
25
|
+
)
|
|
26
|
+
|
|
27
|
+
response = client.edit_get("TestDocument")
|
|
28
|
+
print(response.exists, response.token)
|
|
29
|
+
```
|
|
30
|
+
|
|
31
|
+
## Requirements
|
|
32
|
+
|
|
33
|
+
- Python 3.9+
|
|
34
|
+
- A valid API token issued by the target wiki
|
|
35
|
+
|
|
36
|
+
## Reference
|
|
37
|
+
|
|
38
|
+
- TheSeed API docs: <https://doc.theseed.io/>
|
|
@@ -0,0 +1,38 @@
|
|
|
1
|
+
[project]
|
|
2
|
+
name = "pynamubot"
|
|
3
|
+
version = "0.1.0"
|
|
4
|
+
description = "Python client library for TheSeed-based wiki APIs such as NamuWiki."
|
|
5
|
+
readme = "README.md"
|
|
6
|
+
requires-python = ">=3.9"
|
|
7
|
+
license = { text = "MIT" }
|
|
8
|
+
authors = [
|
|
9
|
+
{ name = "Iodine at NamuWiki" },
|
|
10
|
+
]
|
|
11
|
+
keywords = ["namuwiki", "theseed", "wiki", "api", "bot"]
|
|
12
|
+
classifiers = [
|
|
13
|
+
"Development Status :: 3 - Alpha",
|
|
14
|
+
"Intended Audience :: Developers",
|
|
15
|
+
"License :: OSI Approved :: MIT License",
|
|
16
|
+
"Programming Language :: Python :: 3",
|
|
17
|
+
"Programming Language :: Python :: 3 :: Only",
|
|
18
|
+
"Programming Language :: Python :: 3.9",
|
|
19
|
+
"Programming Language :: Python :: 3.10",
|
|
20
|
+
"Programming Language :: Python :: 3.11",
|
|
21
|
+
"Programming Language :: Python :: 3.12",
|
|
22
|
+
"Topic :: Internet :: WWW/HTTP",
|
|
23
|
+
"Typing :: Typed",
|
|
24
|
+
]
|
|
25
|
+
dependencies = [
|
|
26
|
+
"lark>=1.3.1",
|
|
27
|
+
"pydantic>=2.12.5",
|
|
28
|
+
"requests>=2.32.5",
|
|
29
|
+
"structlog>=25.5.0",
|
|
30
|
+
"typing-extensions>=4.9.0",
|
|
31
|
+
]
|
|
32
|
+
|
|
33
|
+
[project.optional-dependencies]
|
|
34
|
+
puppet = ["selenium>=4.0.0"]
|
|
35
|
+
|
|
36
|
+
[build-system]
|
|
37
|
+
requires = ["uv_build>=0.9.7,<0.10.0"]
|
|
38
|
+
build-backend = "uv_build"
|
|
@@ -0,0 +1,172 @@
|
|
|
1
|
+
"""
|
|
2
|
+
TheSeedAPI client for interacting with the API endpoints.
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
import time
|
|
6
|
+
from functools import cached_property
|
|
7
|
+
from types import TracebackType
|
|
8
|
+
from typing import Any, Optional
|
|
9
|
+
|
|
10
|
+
import requests
|
|
11
|
+
import structlog
|
|
12
|
+
from typing_extensions import Self
|
|
13
|
+
|
|
14
|
+
from ..__version__ import __title__, __version__
|
|
15
|
+
from ..api.schemas import *
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
class Limiter:
|
|
19
|
+
"""
|
|
20
|
+
A simple rate limiter to enforce a minimum interval between requests.
|
|
21
|
+
|
|
22
|
+
Note that NamuWiki's soft rate limit is 1 request per second. Setting interval_seconds to 1.0 is recommended.
|
|
23
|
+
"""
|
|
24
|
+
|
|
25
|
+
def __init__(self, interval_seconds: float) -> None:
|
|
26
|
+
"""
|
|
27
|
+
Initialize the rate limiter.
|
|
28
|
+
|
|
29
|
+
:param interval_seconds: The minimum interval between requests in seconds. Setting this to 0 disables rate limiting by not calling sleep at all.
|
|
30
|
+
"""
|
|
31
|
+
self.interval = float(interval_seconds)
|
|
32
|
+
if self.interval <= 0.0:
|
|
33
|
+
self.acquire = lambda: None
|
|
34
|
+
self.last = float("-inf")
|
|
35
|
+
|
|
36
|
+
def acquire(self) -> None:
|
|
37
|
+
"""
|
|
38
|
+
Acquire the limiter before making a request. This will block if necessary to enforce the rate limit.
|
|
39
|
+
"""
|
|
40
|
+
now = time.monotonic()
|
|
41
|
+
elapsed = now - self.last
|
|
42
|
+
wait = self.interval - elapsed
|
|
43
|
+
time.sleep(max(0.0, wait))
|
|
44
|
+
self.last = time.monotonic()
|
|
45
|
+
|
|
46
|
+
def __enter__(self) -> Self:
|
|
47
|
+
self.acquire()
|
|
48
|
+
return self
|
|
49
|
+
|
|
50
|
+
def __exit__(self, exc_type: Optional[type[BaseException]], exc_value: Optional[BaseException], traceback: Optional[TracebackType]) -> None:
|
|
51
|
+
pass
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
class TheSeedAPIClient:
    """
    Client for interacting with TheSeedAPI endpoints.

    Owns a requests.Session; may be used as a context manager so the session
    is closed deterministically. See <https://doc.theseed.io/> for the HTTP API.
    """

    def __init__(
        self,
        base_url: str,
        api_token: str,
        logger: Optional[structlog.stdlib.BoundLogger] = None,
        limiter: Optional[Limiter] = None,
    ) -> None:
        """
        Initialize TheSeedAPI with the API token and base URL.

        :param base_url: The base URL for the API endpoints.
        :param api_token: The API token string.
        :param logger: An optional structlog logger. Defaults to None (creates a new logger).
        :param limiter: An optional rate limiter. Defaults to None (no limiter).
        """
        self.base_url = base_url.rstrip("/")
        self.api_token = api_token
        headers = {
            "Authorization": f"Bearer {self.api_token}",
            "Content-Type": "application/json",
            "User-Agent": self.user_agent,
        }
        self.session = requests.Session()
        self.session.headers.update(headers)
        self.logger = logger if logger is not None else structlog.get_logger(__name__)
        # A zero-interval Limiter never sleeps, so the request paths below can
        # always enter `self.limiter` without a None check.
        self.limiter = limiter if limiter is not None else Limiter(0.0)

    @cached_property
    def user_agent(self) -> str:
        """
        Get the user agent string for the API client.

        :return: The user agent string.
        """
        return f"{requests.utils.default_user_agent()} {__title__}/{__version__}"

    def edit_get(self, document: str) -> EditGETResponse:
        """
        Fetch the content of the document.

        :param document: The document to fetch.
        :return: The parsed response containing the document text, existence status, and edit token.
        """
        url = f"{self.base_url}/edit/{document}"
        with self.limiter, self.session.get(url) as response:
            response.raise_for_status()
            return EditGETResponse.model_validate(response.json())

    def edit_post(self, document: str, body: EditPOSTBody) -> EditPOSTResponse:
        """
        Edit the document with new text.

        :param document: The document to edit.
        :param body: The request body containing the new text, log message, and edit token.
        :return: The parsed response containing the revision number of the edit.
        """
        url = f"{self.base_url}/edit/{document}"
        # `body` is a pydantic model; requests' json= encoder only understands
        # plain Python types, so serialize the model explicitly.
        with self.limiter, self.session.post(url, json=body.model_dump()) as response:
            response.raise_for_status()
            return EditPOSTResponse.model_validate(response.json())

    def backlink(
        self,
        document: str,
        namespace: Optional[str] = None,
        flag: Optional[int] = None,
        fromm: Optional[str] = None,
        until: Optional[str] = None,
    ) -> BacklinkResponse:
        """
        Retrieve backlinks for the document.

        Note: The behavior when both fromm and until are not None is not well-defined.

        :param document: The document to retrieve backlinks for.
        :param namespace: The namespace of documents to query.
        :param flag: Filter on how document is linked to this document.
        :param fromm: Paginate from this document (inclusive). Note the double 'm' to avoid Python keyword conflict.
        :param until: Paginate until this document (inclusive).
        :return: The parsed response containing the namespaces, backlinks, and from/to information.
        """
        url = f"{self.base_url}/backlink/{document}"
        # requests drops params whose value is None, so unset filters are
        # simply omitted from the query string.
        params: dict[str, Any] = {
            "namespace": namespace,
            "flag": flag,
            "from": fromm,
            "until": until,
        }

        with self.limiter, self.session.get(url, params=params) as response:
            response.raise_for_status()
            return BacklinkResponse.model_validate(response.json())

    def discuss(self, document: str) -> list[DiscussResponse]:
        """
        Fetch discussions on the document.

        :param document: The document to fetch discussions for.
        :return: A list of parsed discussions (slug, topic, updated date, and status).
        """
        url = f"{self.base_url}/discuss/{document}"
        with self.limiter, self.session.get(url) as response:
            response.raise_for_status()
            return [DiscussResponse.model_validate(item) for item in response.json()]

    def __del__(self) -> None:
        # __init__ may have raised before `session` was assigned; the
        # finalizer must not raise in that case.
        session = getattr(self, "session", None)
        if session is not None:
            session.close()

    def __enter__(self) -> Self:
        return self

    def __exit__(self, exc_type: Optional[type[BaseException]], exc_value: Optional[BaseException], traceback: Optional[TracebackType]) -> None:
        self.session.close()
|
|
@@ -0,0 +1,57 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Typed request and response schema definitions for TheSeedAPI.
|
|
3
|
+
|
|
4
|
+
This module defines multiple pydantic `BaseModel` classes that describe the
|
|
5
|
+
expected structure of JSON request bodies and response payloads
|
|
6
|
+
when interacting with TheSeedAPI using the `requests` library.
|
|
7
|
+
|
|
8
|
+
Each class is meant to serve as a static typing aid for improved
|
|
9
|
+
editor support, validation, and readability in API-related code.
|
|
10
|
+
|
|
11
|
+
See <https://doc.theseed.io/> for full API documentation.
|
|
12
|
+
"""
|
|
13
|
+
|
|
14
|
+
from datetime import datetime
|
|
15
|
+
from typing import Literal, Optional
|
|
16
|
+
from pydantic import BaseModel, Field
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
class EditGETResponse(BaseModel):
    """Response payload of the edit GET endpoint (fetch a document for editing)."""

    text: str = Field(description="The content of the document")
    exists: bool = Field(description="Whether the document exists")
    token: str = Field(description="The edit token (will be used for edit POST request)")
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
class EditPOSTBody(BaseModel):
    """Request body of the edit POST endpoint (submit an edited document)."""

    text: str = Field(description="The edited document text")
    log: str = Field(description="Edit summary")
    token: str = Field(description="The edit token (from edit GET request)")
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
class EditPOSTResponse(BaseModel):
    """Response payload of the edit POST endpoint."""

    status: str = Field(description="The status of the edit operation")
    rev: int = Field(description="The edited revision")
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
class Namespaces(BaseModel):
    """Per-namespace document count entry inside a backlink response."""

    namespace: str = Field(description="The namespace of the document")
    count: int = Field(description="The number of documents in the namespace")
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
class Backlinks(BaseModel):
    """A single backlink entry inside a backlink response."""

    document: str = Field(description="The document linked to")
    flags: str = Field(description="Linking flags")
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
class BacklinkResponse(BaseModel):
    """Response payload of the backlink GET endpoint.

    The API uses ``from``/``until`` JSON keys for pagination cursors; ``from``
    is a Python keyword, so the field is named ``fromm`` and populated through
    an alias. Both cursors default to None since the API may omit them.
    """

    namespaces: list[Namespaces] = Field(description="List of namespaces")
    backlinks: list[Backlinks] = Field(description="List of backlinks")
    # The JSON key is "from" (see <https://doc.theseed.io/>); without the alias
    # model_validate() on a real payload fails with a missing-field error.
    fromm: Optional[str] = Field(default=None, alias="from", description="Starting point for the query")
    until: Optional[str] = Field(default=None, description="Ending point for the query")
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
class DiscussResponse(BaseModel):
    """One discussion thread on a document, as returned by the discuss GET endpoint."""

    slug: str = Field(description="The discussion slug")
    topic: str = Field(description="The discussion topic")
    # pydantic coerces the API's Unix timestamp into a datetime.
    updated_date: datetime = Field(description="The last comment's Unix timestamp")
    status: Literal["normal", "close", "pause"] = Field(description="The discussion status")
|
|
File without changes
|
|
@@ -0,0 +1,35 @@
|
|
|
1
|
+
from collections import OrderedDict
|
|
2
|
+
from typing import Any, Optional
|
|
3
|
+
|
|
4
|
+
import structlog
|
|
5
|
+
|
|
6
|
+
from ..api import TheSeedAPIClient
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
class JobModule:
    """Composable unit of bot work.

    A JobModule may own child JobModules; assigning one as an attribute (or
    calling :meth:`add_module`) registers it in ``_modules`` so that shared
    attributes -- currently the API ``client`` -- propagate down the tree.
    """

    # Declared but deliberately not assigned: hasattr(self, "client") stays
    # False until a client is set directly or propagated from a parent module.
    client: TheSeedAPIClient

    def __init__(self, name: Optional[str] = None, logger: Optional[structlog.stdlib.BoundLogger] = None) -> None:
        """
        :param name: Display name for logging; defaults to the class name.
        :param logger: Optional structlog logger; defaults to a logger named after this module.
        """
        # NOTE: _modules must be assigned first -- __setattr__ below reads it
        # whenever a JobModule-valued attribute is subsequently set.
        self._modules: OrderedDict[str, "JobModule"] = OrderedDict()
        self.name = name or self.__class__.__name__
        self.logger = logger or structlog.get_logger(self.name)

    def __setattr__(self, key: str, value: Any) -> None:
        # Intercept attribute assignment so that child JobModules are
        # auto-registered and receive this module's client, if it has one.
        super().__setattr__(key, value)
        if isinstance(value, JobModule):
            name, module = key, value
            self._modules[name] = module
            if hasattr(self, "client"):
                self._propagate_attribute(module, "client", self.client)

    def _propagate_attribute(self, module: "JobModule", attr_name: str, attr_value: Any) -> None:
        # Depth-first propagation into the sub-module tree; a value that is
        # already set on a module is never overwritten.
        if not hasattr(module, attr_name):
            setattr(module, attr_name, attr_value)
        for sub_module in module._modules.values():
            self._propagate_attribute(sub_module, attr_name, attr_value)

    def add_module(self, name: str, module: "JobModule") -> None:
        """Add a sub-module to this job module."""
        # Unlike attribute assignment, this does not expose the module as an
        # attribute of self; it only registers it for propagation.
        self._modules[name] = module
        if hasattr(self, "client"):
            self._propagate_attribute(module, "client", self.client)
|
|
@@ -0,0 +1,420 @@
|
|
|
1
|
+
import re
|
|
2
|
+
import time
|
|
3
|
+
from typing import Any, Callable
|
|
4
|
+
|
|
5
|
+
from .. import specific
|
|
6
|
+
from ..api.api import TheSeedAPIClient
|
|
7
|
+
from ..api.schemas import *
|
|
8
|
+
from ..jobs.job_core import JobModule
|
|
9
|
+
from ..utils import josalib
|
|
10
|
+
from ..utils.misc import safe_div
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
def cleanup_backlinks(
    client: TheSeedAPIClient,
    halt_discuss_document: str,
    pairs: list[tuple[str, str, josalib.JosaType]],
    throttle_seconds: float = 1.0,
) -> None:
    """Convenience wrapper: build a :class:`BacklinkCleanupJob` and run it once.

    :param client: API client used for all wiki requests.
    :param halt_discuss_document: Document whose open discussions act as a stop signal.
    :param pairs: (before, after, josa_type) triples describing link rewrites.
    :param throttle_seconds: Delay inserted before each API request.
    """
    BacklinkCleanupJob(
        client=client,
        halt_discuss_document=halt_discuss_document,
        pairs=pairs,
        throttle_seconds=throttle_seconds,
    ).execute()
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
def compile_pattern(pairs: "list[tuple[str, str, josalib.JosaType]]") -> "re.Pattern[str]":
    """Compile a verbose regex matching a wiki link to any of the *before* titles.

    The pattern matches ``[[Before]]`` / ``[[Before|label]]`` plus an optional
    trailing Hangul josa, exposing named groups ``before_<i>`` (one per pair;
    exactly one is non-None on a match), ``label`` and ``josa``.

    The parameter annotation is a lazy string so importing this function does
    not require evaluating the ``josalib`` reference; only each pair's
    ``before`` title actually participates in the pattern.

    :param pairs: (before, after, josa_type) triples.
    :raises ValueError: if *pairs* is empty -- an empty alternation would
        otherwise match the empty string at every position.
    """
    if not pairs:
        raise ValueError("pairs must not be empty")

    before_patterns = "|".join(rf"(?P<before_{i}>{re.escape(before)})" for i, (before, _, _) in enumerate(pairs))

    pattern = rf"""
        \[\[ # [[ (start of link)
        \s* # optional whitespace
        (?:{before_patterns}) # one of the before patterns
        \s* # optional whitespace
        (?:\|(?P<label>[^[\]]+))? # optional |label
        \]\] # ]] (end of link)
        (?P<josa>[가-힣]+)? # optional josa
    """

    return re.compile(pattern, re.VERBOSE)
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
def fetch_backlink_documents(client: TheSeedAPIClient, pairs: list[tuple[str, str, josalib.JosaType]]) -> set[str]:
    """Collect every document that links to any of the ``before`` titles in *pairs*.

    Pagination follows the response's ``until`` cursor until an empty page is
    returned. Only each pair's ``before`` title is used here.

    :param client: API client to query.
    :param pairs: (before, after, josa_type) triples; the second and third
        elements are ignored by this function.
    :return: The deduplicated set of backlinking document titles.
    """
    documents_to_edit: set[str] = set()

    for before, _, _ in pairs:
        # Cursor for the next page; None requests the first page.
        fromm = None

        while True:
            response = client.backlink(
                document=before,
                namespace=None,
                flag=specific.namuwiki.BacklinkType.ANY,
                fromm=fromm,
            )

            # An empty page marks the end of pagination.
            if not response.backlinks:
                break

            documents_to_edit.update(link.document for link in response.backlinks)
            # NOTE(review): assumes a non-empty page always carries a usable
            # `until` cursor; if the API ever returns a non-empty final page
            # with until=None this would re-request the first page -- confirm
            # against the API contract.
            fromm = response.until

    return documents_to_edit
|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
def catch_halt_signal(client: TheSeedAPIClient, halt_discuss_document: str) -> bool:
    """Check for a halt signal in the discuss document."""
    # Any discussion thread still in the "normal" (open) state counts as a
    # request to halt the running job.
    return any(thread.status == "normal" for thread in client.discuss(halt_discuss_document))
|
|
75
|
+
|
|
76
|
+
|
|
77
|
+
class ThrottleJob(JobModule):
    """Callable job module that pauses for a fixed delay each time it is invoked."""

    def __init__(self, wait_seconds: float = 1.0, *args: Any, **kwargs: Any) -> None:
        super().__init__(*args, **kwargs)
        # Clamp negative (or NaN) delays to zero so __call__ never raises.
        delay = float(wait_seconds)
        self.wait_seconds = delay if delay > 0.0 else 0.0

    def __call__(self) -> None:
        """Sleep for the configured delay."""
        time.sleep(self.wait_seconds)
|
|
84
|
+
|
|
85
|
+
|
|
86
|
+
class CatchHaltSignalJob(JobModule):
    """Callable job module that polls the halt-discussion document.

    Registered pre-hooks (e.g. a throttler) run before every poll.
    """

    def __init__(self, halt_discuss_document: str, *args: Any, **kwargs: Any) -> None:
        super().__init__(*args, **kwargs)
        self.halt_discuss_document = halt_discuss_document
        self._pre_hooks: list[Callable[[], Any]] = []

    def register_pre_hook(self, hook: Callable[[], Any]) -> None:
        """Register a zero-argument callable to run before each halt check."""
        self._pre_hooks.append(hook)

    def __call__(self) -> bool:
        """Run all pre-hooks, then return True if a halt was requested."""
        for pre_hook in self._pre_hooks:
            pre_hook()
        return catch_halt_signal(self.client, self.halt_discuss_document)
|
|
99
|
+
|
|
100
|
+
|
|
101
|
+
class BacklinkCleanupJob(JobModule):
|
|
102
|
+
HALT_CHECK_INTERVAL = 100
|
|
103
|
+
|
|
104
|
+
def __init__(
    self,
    client: TheSeedAPIClient,
    halt_discuss_document: str,
    pairs: list[tuple[str, str, josalib.JosaType]],
    throttle_seconds: float = 1.0,
    *args: Any,
    **kwargs: Any,
) -> None:
    """
    :param client: API client used for all wiki requests.
    :param halt_discuss_document: Document whose open discussions act as a stop signal.
    :param pairs: (before, after, josa_type) triples describing link rewrites.
    :param throttle_seconds: Delay inserted before each API request.
    """
    super().__init__(*args, **kwargs)
    self.client = client
    self.pairs = pairs
    self.patterns = self._compile_patterns(pairs)
    self.throttler = ThrottleJob(wait_seconds=throttle_seconds)
    self.catcher = CatchHaltSignalJob(halt_discuss_document=halt_discuss_document)
    # Throttle before every halt-signal poll as well.
    self.catcher.register_pre_hook(self.throttler)

    self._reset_history()
|
|
122
|
+
|
|
123
|
+
def get_history_summary(self) -> dict[str, Any]:
    """Get a summary of the job execution history with additional metrics."""
    processed = sum(self.history[key] for key in ("success", "failure", "skipped"))
    summary: dict[str, Any] = dict(self.history)
    summary["total_processed"] = processed
    summary["success_rate"] = safe_div(self.history["success"], processed)
    summary["avg_occurrences_per_document"] = safe_div(self.history["occurrences"], processed)
    return summary
|
|
132
|
+
|
|
133
|
+
def _compile_patterns(self, pairs: list[tuple[str, str, josalib.JosaType]]) -> re.Pattern[str]:
    """Compile one verbose regex matching a wiki link to any ``before`` title.

    The pattern exposes named groups ``before_<i>`` (one per pair; exactly one
    is non-None on a match), ``label`` (text after ``|``) and ``josa``
    (optional trailing Hangul particle).
    """
    self.logger.debug(
        "Compiling regex patterns for backlink cleanup",
        pattern_count=len(pairs),
        patterns=[{"before": before, "after": after, "josa_type": josa_type.value} for before, after, josa_type in pairs],
    )

    # One alternative per pair, each in its own named group so _replace_link
    # can recover which pair matched.
    before_patterns = "|".join(rf"(?P<before_{i}>{re.escape(before)})" for i, (before, _, _) in enumerate(pairs))

    pattern = rf"""
    \[\[ # [[ (start of link)
    \s* # optional whitespace
    (?:{before_patterns}) # one of the before patterns
    \s* # optional whitespace
    (?:\|(?P<label>[^[\]]+))? # optional |label
    \]\] # ]] (end of link)
    (?P<josa>[가-힣]+)? # optional josa
    """

    compiled_pattern = re.compile(pattern, re.VERBOSE)
    self.logger.debug(
        "Successfully compiled regex pattern for backlink cleanup",
        pattern=pattern,
    )
    return compiled_pattern
|
|
158
|
+
|
|
159
|
+
def execute(self, *args: Any, **kwargs: Any) -> Any:
    """Run the cleanup: collect backlinking documents, then rewrite each one."""
    self.logger.info("Starting backlink cleanup job")
    targets = sorted(self._fetch_backlink_documents())

    self.logger.info("Beginning document processing", total_documents=len(targets))
    try:
        self._process_documents(targets)
    except Exception as exc:
        # Log the failure with final stats, then let the caller see it.
        self.logger.error(
            "Backlink cleanup job failed",
            error=str(exc),
            error_type=type(exc).__name__,
            final_stats=self.get_history_summary(),
        )
        raise
    else:
        self.logger.info(
            "Backlink cleanup job completed",
            final_stats=self.get_history_summary(),
        )
|
|
179
|
+
|
|
180
|
+
def _reset_history(self) -> None:
    """Reset history counters for this execution."""
    # Counters: documents edited / failed / unchanged, plus total pattern
    # matches observed across all processed documents.
    self.history = dict.fromkeys(("success", "failure", "skipped", "occurrences"), 0)
|
|
188
|
+
|
|
189
|
+
def _process_documents(self, documents_to_edit: list[str]) -> None:
    """Process all documents in the list."""
    total = len(documents_to_edit)
    for index, title in enumerate(documents_to_edit):
        # Periodically poll the halt document; stop the whole run on request.
        if self._should_check_halt_signal(index) and self._check_halt_and_maybe_return(index):
            return

        self._log_document_progress(title, index, total)
        self._process_single_document(title)
|
|
198
|
+
|
|
199
|
+
def _should_check_halt_signal(self, index: int) -> bool:
    """Check if we should check for halt signal at this iteration."""
    # True on iterations 0, HALT_CHECK_INTERVAL, 2 * HALT_CHECK_INTERVAL, ...
    return not index % self.HALT_CHECK_INTERVAL
|
|
202
|
+
|
|
203
|
+
def _check_halt_and_maybe_return(self, processed_count: int) -> bool:
    """Check for halt signal and return True if should terminate early."""
    self.throttler()
    halted = self.catcher()
    if not halted:
        return False
    self.logger.info(
        "Halt signal detected - terminating job early",
        processed_documents=processed_count,
        final_stats=self.get_history_summary(),
    )
    return True
|
|
214
|
+
|
|
215
|
+
def _log_document_progress(self, document: str, index: int, total: int) -> None:
    """Log progress for current document."""
    position = index + 1
    self.logger.debug(
        "Processing document",
        document=document,
        progress=f"{position}/{total}",
        percentage=f"{position / total: 6.2%}",
    )
|
|
223
|
+
|
|
224
|
+
def _process_single_document(self, document: str) -> None:
    """Process a single document with error handling."""
    try:
        self._edit_document(document)
    except Exception as exc:
        # One bad document must not abort the whole run: count it and move on.
        self.logger.error(
            "Failed to edit document",
            document=document,
            error=str(exc),
            error_type=type(exc).__name__,
        )
        self.history["failure"] += 1
|
|
236
|
+
|
|
237
|
+
def _fetch_backlink_documents(self) -> set[str]:
    """Collect every document that links to any configured ``before`` title.

    Pagination follows the response's ``until`` cursor per source document
    until an empty page is returned; requests are throttled.

    :return: The deduplicated set of backlinking document titles.
    """
    self.logger.debug("Starting backlink document collection")
    documents_to_edit: set[str] = set()

    for before, _, _ in self.pairs:
        self.logger.debug("Fetching backlinks for document", document=before)
        # Cursor for the next page; None requests the first page.
        fromm = None

        while True:
            self.logger.debug(
                "Requesting backlink batch",
                source_document=before,
                from_cursor=fromm,
            )

            self.throttler()
            response = self.client.backlink(
                document=before,
                namespace=None,
                flag=specific.namuwiki.BacklinkType.ANY,
                fromm=fromm,
            )

            # An empty page marks the end of pagination for this source.
            if not response.backlinks:
                self.logger.debug(
                    "Reached end of backlinks",
                    source_document=before,
                    final_cursor=response.fromm,
                )
                break
            else:
                self.logger.debug(
                    "Retrieved backlink batch",
                    source_document=before,
                    batch_size=len(response.backlinks),
                    from_cursor=response.fromm,
                    until_cursor=response.until,
                )

            documents_to_edit.update(link.document for link in response.backlinks)
            # NOTE(review): assumes a non-empty page always carries a usable
            # `until` cursor; a non-empty final page with until=None would
            # re-request the first page -- confirm against the API contract.
            fromm = response.until

    self.logger.info(
        "Backlink document collection completed",
        total_unique_documents=len(documents_to_edit),
        source_documents=[before for before, _, _ in self.pairs],
    )

    return documents_to_edit
|
|
286
|
+
|
|
287
|
+
def _replace_link(self, m: re.Match[str]) -> str:
    """re.sub callback: rewrite one matched wiki link to its replacement title.

    Exactly one ``before_<i>`` group is non-None by construction of the
    pattern; the corresponding (before, after, josa_type) pair drives the
    rewrite. A trailing josa is re-fitted to the new title; if the fit is
    ambiguous, the old title is kept as the visible label so the rendered
    page text does not change.
    """
    for i, (before, after, josa_type) in enumerate(self.pairs):
        if m.group(f"before_{i}") is None:
            continue

        josa_old = m.group("josa")
        label = m.group("label")

        self.logger.debug(
            "Processing link replacement",
            before=before,
            after=after,
            original_josa=josa_old,
            original_label=label,
            josa_type=josa_type.value,
        )

        if josa_old is None:
            josa_new = ""
        else:
            try:
                fitter = josalib.get_josa_fitter(josa_type)
                josa_new = fitter(josa_old)
                self.logger.debug(
                    "Josa transformation successful",
                    original_josa=josa_old,
                    transformed_josa=josa_new,
                )
            except josalib.AmbiguousJosaError:
                # Cannot choose the right particle for the new title: keep the
                # old title as the label so the old josa still reads correctly.
                self.logger.debug(
                    "Ambiguous josa encountered - preserving original",
                    original_josa=josa_old,
                    fallback_label=before,
                )
                label, josa_new = before, josa_old

        # Omit the label when it is absent or redundant with the new title.
        if label is None or label == after:
            result = f"[[{after}]]{josa_new}"
        else:
            result = f"[[{after}|{label}]]{josa_new}"

        self.logger.debug(
            "Link replacement completed",
            original_match=m.group(0),
            replacement=result,
        )
        return result

    # Unreachable by construction (the pattern was built from self.pairs, so
    # some before_<i> group matched). Raise explicitly instead of the previous
    # `assert False`: asserts are stripped under `python -O`, which would make
    # this callback return None and crash re.sub with an opaque TypeError.
    raise AssertionError(f"no before_<i> group matched for {m.group(0)!r}")
|
|
336
|
+
|
|
337
|
+
def _edit_document(self, document: str) -> None:
    """Fetch one document, rewrite its links and, if anything changed, resubmit it."""
    self.logger.debug("Starting document edit", document=document)
    self.throttler()
    response = self.client.edit_get(document=document)

    if not response.exists:
        self._handle_nonexistent_document(document)
        return

    original_text = response.text
    edited_text = self.patterns.sub(self._replace_link, original_text)

    # Count raw pattern matches for the stats, even when the substitution
    # turns out to be a no-op.
    matches = list(self.patterns.finditer(original_text))
    self.history["occurrences"] += len(matches)

    if original_text == edited_text:
        self._handle_no_changes_needed(document, matches)
        return

    self._submit_document_changes(document, original_text, edited_text, matches, response.token)
|
|
357
|
+
|
|
358
|
+
def _handle_nonexistent_document(self, document: str) -> None:
    """Record a missing document: count it as a failure and log the skip."""
    self.history["failure"] += 1
    self.logger.debug("Document does not exist - skipping edit", document=document)
|
|
362
|
+
|
|
363
|
+
def _handle_no_changes_needed(self, document: str, matches: list[re.Match[str]]) -> None:
|
|
364
|
+
"""Handle case where no changes are needed."""
|
|
365
|
+
self.logger.debug(
|
|
366
|
+
"No changes needed for document",
|
|
367
|
+
document=document,
|
|
368
|
+
pattern_matches=len(matches),
|
|
369
|
+
)
|
|
370
|
+
self.history["skipped"] += 1
|
|
371
|
+
|
|
372
|
+
def _submit_document_changes(
    self,
    document: str,
    original_text: str,
    edited_text: str,
    matches: list[re.Match[str]],
    token: str,
) -> None:
    """POST the edited text back to the wiki and record the outcome.

    Delegates bookkeeping to ``_handle_successful_edit`` /
    ``_handle_failed_edit``; the latter re-raises the caught exception.
    """
    delta = len(edited_text) - len(original_text)
    self.logger.debug(
        "Submitting document changes",
        document=document,
        pattern_matches=len(matches),
        text_length_change=delta,
    )

    try:
        self.throttler()
        outcome = self.client.edit_post(
            document=document,
            body=EditPOSTBody(
                text=edited_text,
                log="🌳🤖",
                token=token,
            ),
        )
        # Kept inside the try so a failure while reading the revision is
        # also routed to the failure handler, as in the original flow.
        self._handle_successful_edit(document, outcome.rev)
    except Exception as e:
        self._handle_failed_edit(document, e)
|
|
401
|
+
|
|
402
|
+
def _handle_successful_edit(self, document: str, revision: Any) -> None:
|
|
403
|
+
"""Handle successful document edit."""
|
|
404
|
+
self.history["success"] += 1
|
|
405
|
+
self.logger.debug(
|
|
406
|
+
"Document edit completed successfully",
|
|
407
|
+
document=document,
|
|
408
|
+
revision=revision,
|
|
409
|
+
)
|
|
410
|
+
|
|
411
|
+
def _handle_failed_edit(self, document: str, error: Exception) -> None:
|
|
412
|
+
"""Handle failed document edit."""
|
|
413
|
+
self.history["failure"] += 1
|
|
414
|
+
self.logger.error(
|
|
415
|
+
"Failed to submit document edit",
|
|
416
|
+
document=document,
|
|
417
|
+
error=str(error),
|
|
418
|
+
error_type=type(error).__name__,
|
|
419
|
+
)
|
|
420
|
+
raise
|
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
from importlib.resources import files
|
|
2
|
+
from functools import lru_cache
|
|
3
|
+
|
|
4
|
+
from lark import Lark, Tree
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
def _load_grammar() -> str:
    """Read the bundled ``theseed.lark`` grammar from this package's data."""
    grammar_path = files("pynamubot.parse").joinpath("theseed.lark")
    return grammar_path.read_text(encoding="utf-8")
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
@lru_cache(maxsize=1)
def get_parser() -> Lark:
    """Build (at most once) the LALR parser for the TheSeed grammar.

    Raises:
        RuntimeError: if Lark cannot compile the bundled grammar.
    """
    try:
        # Grammar loading stays inside the try so a read failure is also
        # reported as the grammar being unloadable.
        built = Lark(_load_grammar(), parser="lalr")
    except Exception as exc:
        raise RuntimeError("The bundled TheSeed grammar is incomplete and cannot be loaded yet.") from exc
    return built
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
def parse_markup(text: str) -> Tree:
    """Parse TheSeed wiki markup *text* into a Lark parse tree."""
    active_parser = get_parser()
    return active_parser.parse(text)
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
class _LazyParser:
    """Proxy that defers parser construction until first use.

    Importing the module never builds the (potentially failing) Lark
    parser; ``parse`` calls and attribute access are forwarded to the
    cached instance returned by :func:`get_parser`.
    """

    def parse(self, text: str) -> Tree:
        """Forward to the real parser's ``parse``."""
        delegate = get_parser()
        return delegate.parse(text)

    def __getattr__(self, name: str):
        # Invoked only for attributes the proxy itself does not define.
        return getattr(get_parser(), name)


# Module-level lazy singleton exposed as the public `parser` object.
parser = _LazyParser()
|
|
@@ -0,0 +1,71 @@
|
|
|
1
|
+
// Lark grammar for TheSeed / NamuWiki markup (intended for the LALR parser).
// NOTE(review): this grammar is incomplete — `external_link`, `file`, and
// `classification` are referenced by `namu_mark_atom` but never defined, so
// Lark cannot load it yet (get_parser() wraps that failure in a RuntimeError).

start: document

// A document is either a pure redirect or regular markup.
document: redirect_document | namu_mark

redirect_document: REDIRECT_MARKER internal_link
internal_link: ANY

// Markup body: atoms separated by newlines; the `_no_newline` variant is
// used inside heading lines, which cannot span lines.
namu_mark: (namu_mark_atom | NEWLINE)*
namu_mark_no_newline: namu_mark_atom*
namu_mark_atom: paragraph
	| macro
	| internal_link
	| external_link
	| file
	| classification

// Headings, levels 1-6; "open" is the plain form, "closed" is the
// `#`-marked collapsed form.
paragraph: paragraph_level1_open
	| paragraph_level2_open
	| paragraph_level3_open
	| paragraph_level4_open
	| paragraph_level5_open
	| paragraph_level6_open
	| paragraph_level1_closed
	| paragraph_level2_closed
	| paragraph_level3_closed
	| paragraph_level4_closed
	| paragraph_level5_closed
	| paragraph_level6_closed

paragraph_level1_open: "= " namu_mark_no_newline " ="
paragraph_level2_open: "== " namu_mark_no_newline " =="
paragraph_level3_open: "=== " namu_mark_no_newline " ==="
paragraph_level4_open: "==== " namu_mark_no_newline " ===="
paragraph_level5_open: "===== " namu_mark_no_newline " ====="
paragraph_level6_open: "====== " namu_mark_no_newline " ======"
paragraph_level1_closed: "=# " namu_mark_no_newline " #="
paragraph_level2_closed: "==# " namu_mark_no_newline " #=="
paragraph_level3_closed: "===# " namu_mark_no_newline " #==="
paragraph_level4_closed: "====# " namu_mark_no_newline " #===="
paragraph_level5_closed: "=====# " namu_mark_no_newline " #====="
paragraph_level6_closed: "======# " namu_mark_no_newline " #======"

// Built-in macros: `[name]` or `[name(arguments)]`; names are
// case-insensitive ("i" flag) except the two Korean aliases.
macro: "[" macro_name ("(" arguments ")")? "]"
macro_name: "age"i
	| "anchor"i
	| "dday"i
	| "include"i
	| "youtube"i
	| "nicovideo"i
	| "kakaotv"i
	| "pagecount"i
	| "navertv"i
	| "vimeo"i
	| "br"i
	| "clearfix"i
	| "date"i
	| "datetime"i
	| "footnote"i
	| "tableofcontents"i
	| "각주"
	| "목차"

arguments: ANY

NEWLINE: "\n"
REDIRECT_MARKER: "#redirect"i | "#넘겨주기"
// NOTE(review): declared but not referenced by any rule above.
EXTERNAL_LINK_PROTOCOL: "http"i
	| "https"i
	| "ftp"i
// Catch-all terminal; greedy. `_ANY` duplicates it as a filtered terminal
// and is currently unused — presumably a placeholder for future rules.
ANY: /.+/
_ANY: /.+/
|
|
@@ -0,0 +1,39 @@
|
|
|
1
|
+
from typing import TYPE_CHECKING
|
|
2
|
+
|
|
3
|
+
if TYPE_CHECKING:
|
|
4
|
+
from selenium.webdriver.remote.webdriver import WebDriver as RemoteWebDriver
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
def create_driver(browser_name: str, *, headless: bool = True) -> "RemoteWebDriver":
    """Instantiate a Selenium WebDriver for *browser_name*.

    Selenium bindings are imported lazily so only the requested browser's
    driver module is loaded.

    Args:
        browser_name: One of "chrome", "edge", "firefox", "safari"
            (case-insensitive).
        headless: Run without a visible window where supported
            (no headless flag is passed for Safari).

    Raises:
        ValueError: If *browser_name* is not a supported browser.
    """
    key = browser_name.lower()

    if key == "chrome":
        from selenium.webdriver import Chrome, ChromeOptions

        opts = ChromeOptions()
        if headless:
            opts.add_argument("--headless=new")
        return Chrome(options=opts)

    if key == "edge":
        from selenium.webdriver import Edge, EdgeOptions

        opts = EdgeOptions()
        if headless:
            opts.add_argument("--headless=new")
        return Edge(options=opts)

    if key == "firefox":
        from selenium.webdriver import Firefox, FirefoxOptions

        opts = FirefoxOptions()
        if headless:
            # Firefox uses a single-dash headless flag, unlike Chromium.
            opts.add_argument("-headless")
        return Firefox(options=opts)

    if key == "safari":
        from selenium.webdriver import Safari

        return Safari()

    raise ValueError(f"Unsupported browser: {browser_name}")
|
|
File without changes
|
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
import enum
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
class NamespaceType(enum.Enum):
    """AlphaWiki document namespaces, keyed by their Korean title prefix."""

    CATEGORY = "분류"
    DOCUMENT = "문서"
    FRAME = "틀"
    FILE = "파일"
    TEMPLATE = "템플릿"
    USER = "사용자"
    # Site-specific meta namespace: named after the wiki itself
    # (the sibling namuwiki/theseedwiki modules differ only in this value).
    META = "알파위키"
    TRASH = "휴지통"
    SYSTEM = "시스템"
    FILE_TRASH = "파일휴지통"
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
class BacklinkType(enum.IntEnum):
    """Backlink relation kinds.

    Values are distinct powers of two (plus 0 for ANY), so they look
    combinable as a bitmask — TODO confirm against the wiki API.
    """

    ANY = 0
    LINK = 1
    FILE = 2
    INCLUDE = 4
    REDIRECT = 8
|
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
import enum
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
class NamespaceType(enum.Enum):
    """NamuWiki document namespaces, keyed by their Korean title prefix."""

    CATEGORY = "분류"
    DOCUMENT = "문서"
    FRAME = "틀"
    FILE = "파일"
    TEMPLATE = "템플릿"
    USER = "사용자"
    # Site-specific meta namespace: named after the wiki itself
    # (the sibling alphawiki/theseedwiki modules differ only in this value).
    META = "나무위키"
    TRASH = "휴지통"
    SYSTEM = "시스템"
    FILE_TRASH = "파일휴지통"
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
class BacklinkType(enum.IntEnum):
    """Backlink relation kinds.

    Values are distinct powers of two (plus 0 for ANY), so they look
    combinable as a bitmask — TODO confirm against the wiki API.
    """

    ANY = 0
    LINK = 1
    FILE = 2
    INCLUDE = 4
    REDIRECT = 8
|
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
import enum
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
class NamespaceType(enum.Enum):
    """TheSeed Wiki document namespaces, keyed by their Korean title prefix."""

    CATEGORY = "분류"
    DOCUMENT = "문서"
    FRAME = "틀"
    FILE = "파일"
    TEMPLATE = "템플릿"
    USER = "사용자"
    # Site-specific meta namespace: named after the wiki itself
    # (the sibling alphawiki/namuwiki modules differ only in this value).
    META = "더시드위키"
    TRASH = "휴지통"
    SYSTEM = "시스템"
    FILE_TRASH = "파일휴지통"
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
class BacklinkType(enum.IntEnum):
    """Backlink relation kinds.

    Values are distinct powers of two (plus 0 for ANY), so they look
    combinable as a bitmask — TODO confirm against the wiki API.
    """

    ANY = 0
    LINK = 1
    FILE = 2
    INCLUDE = 4
    REDIRECT = 8
|
|
File without changes
|
|
@@ -0,0 +1,201 @@
|
|
|
1
|
+
from enum import Enum
|
|
2
|
+
from typing import Callable, Union
|
|
3
|
+
|
|
4
|
+
import structlog
|
|
5
|
+
|
|
6
|
+
logger = structlog.get_logger(__name__)
|
|
7
|
+
|
|
8
|
+
Fitter = Callable[[str], str]
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
class JosaError(Exception):
    """Base class for josa (Korean particle) handling errors in this module."""
    ...
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
class AmbiguousJosaError(JosaError):
    """Raised when a particle cannot be converted unambiguously (e.g. "야")."""
    ...
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
class JosaType(Enum):
    """Classification of a string's final syllable for particle selection."""

    # Final Hangul syllable ends in a trailing consonant (jongseong).
    HAS_JONGSEONG = "HAS_JONGSEONG"
    # Final Hangul syllable is open (no trailing consonant).
    NO_JONGSEONG = "NO_JONGSEONG"
    # Empty string, or last character outside the precomposed Hangul range.
    UNKNOWN = "UNKNOWN"
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
_HAS_TO_NO = [
|
|
24
|
+
("과", "와"),
|
|
25
|
+
("아", "야"),
|
|
26
|
+
("으로", "로"),
|
|
27
|
+
("으론", "론"),
|
|
28
|
+
("은", "는"),
|
|
29
|
+
("을", "를"),
|
|
30
|
+
("이", "가"),
|
|
31
|
+
("이다", "이다"), # No change for this case
|
|
32
|
+
("이며", "이며"), # No change for this case
|
|
33
|
+
("이면", "면"), # Remove '이' for this case
|
|
34
|
+
("이었", "였"),
|
|
35
|
+
("이고", "고"),
|
|
36
|
+
("이나", "나"),
|
|
37
|
+
("이든", "든"),
|
|
38
|
+
("이라", "라"),
|
|
39
|
+
("이란", "란"),
|
|
40
|
+
("이랑", "랑"),
|
|
41
|
+
("이셔", "셔"),
|
|
42
|
+
("이셨", "셨"),
|
|
43
|
+
("이시여", "시여"),
|
|
44
|
+
("이여", "여"),
|
|
45
|
+
("이래", "래"),
|
|
46
|
+
("이랬", "랬"),
|
|
47
|
+
("이렷", "렷"),
|
|
48
|
+
("이로", "로"),
|
|
49
|
+
("이야", "야"),
|
|
50
|
+
("아", "야"),
|
|
51
|
+
]
|
|
52
|
+
|
|
53
|
+
# Particle rewrite table: (form used after a vowel) -> (form after jongseong).
# Scanned in order by to_has_jongseong(); the LAST matching entry wins.
# "야" appears twice because it is genuinely ambiguous ('아' vs '이야');
# to_has_jongseong() raises AmbiguousJosaError for "야" before ever
# consulting this table, so the duplicates are documentation of the
# ambiguity rather than live alternatives.
_NO_TO_HAS = [
    ("와", "과"),
    ("로", "으로"),
    ("론", "으론"),
    ("는", "은"),
    ("를", "을"),
    ("가", "이"),
    ("다", "이다"),  # Add '이' for this case
    ("며", "이며"),  # Add '이' for this case
    ("면", "이면"),  # Add '이' for this case
    ("였", "이었"),
    ("고", "이고"),
    ("나", "이나"),
    ("든", "이든"),
    ("라", "이라"),
    ("란", "이란"),
    ("랑", "이랑"),
    ("셔", "이셔"),
    ("셨", "이셨"),
    ("시", "이시"),
    ("여", "이여"),
    ("래", "이래"),
    ("랬", "이랬"),
    ("렷", "이렷"),
    ("야", "아"),  # Ambiguous: could be '아' or '이야'
    ("야", "이야"),  # Ambiguous: could be '아' or '이야'
    ("로서", "으로서"),
    ("로써", "으로써"),
]
|
|
82
|
+
|
|
83
|
+
|
|
84
|
+
def get_josa_type(string: str) -> JosaType:
    """Classify *string* by whether its final Hangul syllable has a jongseong.

    Returns ``UNKNOWN`` for empty strings and for strings whose last
    character falls outside the precomposed Hangul syllable range.
    """
    logger.debug("Determining josa type", string=string)

    if not string:
        logger.debug("Empty string, returning UNKNOWN")
        return JosaType.UNKNOWN

    last = string[-1]
    logger.debug("Last character analysis", char=last, unicode_code=ord(last))

    if not "가" <= last <= "힣":
        logger.debug("Character not in Hangul range, returning UNKNOWN")
        return JosaType.UNKNOWN

    # Precomposed syllables: (codepoint - 0xAC00) % 28 == 0 means there is
    # no trailing consonant.
    jongseong = (ord(last) - 44032) % 28
    if jongseong:
        logger.debug(
            "Jongseong detected",
            char=last,
            jongseong_index=jongseong,
            result="HAS_JONGSEONG",
        )
        return JosaType.HAS_JONGSEONG

    logger.debug("No jongseong detected", char=last, result="NO_JONGSEONG")
    return JosaType.NO_JONGSEONG
|
|
111
|
+
|
|
112
|
+
|
|
113
|
+
def to_has_jongseong(josa: str) -> str:
    """Rewrite *josa* into the form used after a syllable with jongseong.

    Returns the input unchanged when no table entry applies.

    Raises:
        AmbiguousJosaError: for the inherently ambiguous particle "야".
    """
    logger.debug("Converting josa to has_jongseong form", input_josa=josa)
    old_to_new = _NO_TO_HAS

    if josa == "야":
        logger.info("Encountered ambiguous josa '야'", josa=josa)
        raise AmbiguousJosaError()

    candidates: list[str] = []
    for source, target in old_to_new:
        if josa == source:
            logger.debug("Exact match found", old_josa=source, new_josa=target)
            return target
        if josa.startswith(source):
            rewritten = target + josa[len(source):]
            candidates.append(rewritten)
            logger.debug(
                "Partial match found",
                old_josa=source,
                new_josa=target,
                candidate=rewritten,
            )

    if candidates:
        # Later table entries take precedence over earlier ones.
        chosen = candidates[-1]
        logger.debug(
            "Multiple candidates found, returning last",
            candidates=candidates,
            result=chosen,
        )
        return chosen

    logger.debug("No matches found, returning original", result=josa)
    return josa
|
|
147
|
+
|
|
148
|
+
|
|
149
|
+
def to_no_jongseong(josa: str) -> str:
    """Rewrite *josa* into the form used after a syllable without jongseong.

    Falls back to stripping a leading "이" when no table entry applies;
    otherwise returns the input unchanged.
    """
    logger.debug("Converting josa to no_jongseong form", input_josa=josa)
    old_to_new = _HAS_TO_NO

    candidates: list[str] = []
    for source, target in old_to_new:
        if josa == source:
            logger.debug("Exact match found", old_josa=source, new_josa=target)
            return target
        if josa.startswith(source):
            rewritten = target + josa[len(source):]
            candidates.append(rewritten)
            logger.debug(
                "Partial match found",
                old_josa=source,
                new_josa=target,
                candidate=rewritten,
            )

    if candidates:
        # Later table entries take precedence over earlier ones.
        chosen = candidates[-1]
        logger.debug(
            "Multiple candidates found, returning last",
            candidates=candidates,
            result=chosen,
        )
        return chosen

    if josa.startswith("이"):
        trimmed = josa[1:]
        logger.debug("Fallback: removed leading '이'", original=josa, result=trimmed)
        return trimmed

    logger.debug("No matches found, returning original", result=josa)
    return josa
|
|
184
|
+
|
|
185
|
+
|
|
186
|
+
def get_josa_fitter(string: Union[str, JosaType]) -> Fitter:
    """Return the conversion function matching *string*'s josa type.

    Accepts either a raw string (classified via :func:`get_josa_type`)
    or an explicit :class:`JosaType`.

    Raises:
        ValueError: when the josa type is ``UNKNOWN`` (or unrecognized).
    """
    josa_type = string if isinstance(string, JosaType) else get_josa_type(string)
    logger.debug("Getting josa fitter", input=string, josa_type=josa_type)

    if josa_type == JosaType.HAS_JONGSEONG:
        logger.debug("Returning to_no_jongseong fitter")
        return to_no_jongseong
    if josa_type == JosaType.NO_JONGSEONG:
        logger.debug("Returning to_has_jongseong fitter")
        return to_has_jongseong

    # UNKNOWN and any future value fall through here — the original code
    # raised the identical error from two separate branches.
    logger.info("Unknown josa type encountered", input=string)
    raise ValueError("Unknown josa type")
|
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
def safe_div(numerator: float, denominator: float, safe_value: float = 0.0) -> float:
    """
    Safely divides two numbers, returning a safe value if the denominator is zero.

    Args:
        numerator (float): The numerator of the division.
        denominator (float): The denominator of the division.
        safe_value (float, optional): The value to return if the denominator is zero. Defaults to 0.0.

    Returns:
        float: The result of the division or the safe value if the denominator is zero.
    """
    try:
        quotient = numerator / denominator
    except ZeroDivisionError:
        return safe_value
    return quotient
|