web-queue2 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- web_queue2-0.1.0/LICENSE +21 -0
- web_queue2-0.1.0/PKG-INFO +48 -0
- web_queue2-0.1.0/README.md +11 -0
- web_queue2-0.1.0/pyproject.toml +65 -0
- web_queue2-0.1.0/web_queue/VERSION +1 -0
- web_queue2-0.1.0/web_queue/__init__.py +3 -0
- web_queue2-0.1.0/web_queue/app.py +63 -0
- web_queue2-0.1.0/web_queue/client/__init__.py +3 -0
- web_queue2-0.1.0/web_queue/client/_client.py +95 -0
- web_queue2-0.1.0/web_queue/client/ai/__init__.py +3 -0
- web_queue2-0.1.0/web_queue/client/ai/_ai.py +140 -0
- web_queue2-0.1.0/web_queue/client/clean/__init__.py +3 -0
- web_queue2-0.1.0/web_queue/client/clean/_clean.py +25 -0
- web_queue2-0.1.0/web_queue/client/config.py +61 -0
- web_queue2-0.1.0/web_queue/client/web/__init__.py +3 -0
- web_queue2-0.1.0/web_queue/client/web/_web.py +175 -0
- web_queue2-0.1.0/web_queue/config.py +33 -0
- web_queue2-0.1.0/web_queue/types/__init__.py +0 -0
- web_queue2-0.1.0/web_queue/types/fetch_html_message.py +26 -0
- web_queue2-0.1.0/web_queue/types/html_content.py +18 -0
- web_queue2-0.1.0/web_queue/types/html_metadata_response.py +82 -0
- web_queue2-0.1.0/web_queue/types/message.py +34 -0
- web_queue2-0.1.0/web_queue/utils/__init__.py +0 -0
- web_queue2-0.1.0/web_queue/utils/compression.py +29 -0
- web_queue2-0.1.0/web_queue/utils/html_cleaner.py +145 -0
- web_queue2-0.1.0/web_queue/utils/html_to_str.py +21 -0
- web_queue2-0.1.0/web_queue/utils/human_delay.py +11 -0
- web_queue2-0.1.0/web_queue/utils/page_with_init_script.py +15 -0
- web_queue2-0.1.0/web_queue/utils/simulate_mouse_circling.py +49 -0
- web_queue2-0.1.0/web_queue/utils/simulate_scrolling.py +18 -0
web_queue2-0.1.0/LICENSE
ADDED
@@ -0,0 +1,21 @@
MIT License

Copyright (c) 2025 AllenChou

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

web_queue2-0.1.0/PKG-INFO
ADDED
@@ -0,0 +1,48 @@
Metadata-Version: 2.4
Name: web-queue2
Version: 0.1.0
Summary: Get web content from queue.
License: MIT
License-File: LICENSE
Author: Allen Chou
Author-email: f1470891079@gmail.com
Requires-Python: >=3.11,<4
Classifier: License :: OSI Approved :: MIT License
Classifier: Programming Language :: Python :: 3
Classifier: Programming Language :: Python :: 3.11
Classifier: Programming Language :: Python :: 3.12
Classifier: Programming Language :: Python :: 3.13
Classifier: Programming Language :: Python :: 3.14
Requires-Dist: bs4
Requires-Dist: cachetic
Requires-Dist: dictpress
Requires-Dist: fastapi
Requires-Dist: httpx
Requires-Dist: huey
Requires-Dist: logfire
Requires-Dist: logging_bullet_train
Requires-Dist: openai (>=1,<2)
Requires-Dist: openai-agents (>=0.1.0,<1.0.0)
Requires-Dist: playwright
Requires-Dist: pydantic (>=2)
Requires-Dist: pydantic-settings
Requires-Dist: str-or-none
Requires-Dist: tiktoken
Requires-Dist: yarl
Project-URL: Homepage, https://github.com/allen2c/web-queue
Project-URL: PyPI, https://pypi.org/project/web-queue/
Project-URL: Repository, https://github.com/allen2c/web-queue
Description-Content-Type: text/markdown

# Web-Queue

[](https://pypi.org/project/web-queue/)
[](https://pypi.org/project/web-queue/)
[](https://opensource.org/licenses/MIT)

A web content pipeline library.

## License

MIT License

web_queue2-0.1.0/README.md
ADDED
@@ -0,0 +1,11 @@
# Web-Queue

[](https://pypi.org/project/web-queue/)
[](https://pypi.org/project/web-queue/)
[](https://opensource.org/licenses/MIT)

A web content pipeline library.

## License

MIT License

web_queue2-0.1.0/pyproject.toml
ADDED
@@ -0,0 +1,65 @@
[project]
authors = [{ name = "Allen Chou", email = "f1470891079@gmail.com" }]
dependencies = [
    "bs4",
    "cachetic",
    "dictpress",
    "fastapi",
    "httpx",
    "huey",
    "logfire",
    "logging_bullet_train",
    "openai (>=1,<2)",
    "openai-agents (>=0.1.0,<1.0.0)",
    "playwright",
    "pydantic (>=2)",
    "pydantic-settings",
    "str-or-none",
    "tiktoken",
    "yarl",
]
description = "Get web content from queue."
license = { text = "MIT" }
name = "web-queue2"
readme = "README.md"
requires-python = ">=3.11,<4"
version = "0.1.0"

[project.urls]
Homepage = "https://github.com/allen2c/web-queue"
"PyPI" = "https://pypi.org/project/web-queue/"
Repository = "https://github.com/allen2c/web-queue"

[tool.poetry]
packages = [{ include = "web_queue" }]

[tool.poetry.extras]
all = []

[tool.poetry.group.dev.dependencies]
black = { extras = ["jupyter"], version = "*" }
isort = "*"
poetry-plugin-export = "*"
pytest = "*"
pytest-asyncio = "*"
pytest-cov = "*"
pytest-env = "*"
pytest-xdist = "*"
rich = "*"
rich-color-support = "*"
setuptools = "*"
twine = "*"

[tool.isort]
profile = "black"

[tool.flake8]
ignore = ["E203", "E704", "W503"]
max-line-length = 88

[tool.pytest.ini_options]
env = ["ENVIRONMENT=test", "PYTEST_IS_RUNNING=true"]

[build-system]
build-backend = "poetry.core.masonry.api"
requires = ["poetry-core>=2.0.0,<3.0.0"]

web_queue2-0.1.0/web_queue/VERSION
ADDED
@@ -0,0 +1 @@
0.1.0

web_queue2-0.1.0/web_queue/app.py
ADDED
@@ -0,0 +1,63 @@
import asyncio
import logging
import typing

import huey
import logfire
import logging_bullet_train as lbt
from huey.api import Task

import web_queue.config

if typing.TYPE_CHECKING:
    from web_queue.client import WebQueueClient
    from web_queue.types.fetch_html_message import FetchHTMLMessage
    from web_queue.types.html_content import HTMLContent

lbt.set_logger("web_queue")

logfire.configure()
logfire.instrument_openai()

logger = logging.getLogger(__name__)

logger.info("Web queue app starting...")

web_queue_settings = web_queue.config.Settings()
logger.info(f"Web queue connecting to redis: {web_queue_settings.web_queue_safe_url}")

huey_app = huey.RedisExpireHuey(
    web_queue_settings.WEB_QUEUE_NAME,
    url=web_queue_settings.WEB_QUEUE_URL.get_secret_value(),
    expire_time=24 * 60 * 60,  # 24 hours
)


@huey_app.task(
    retries=1,
    retry_delay=8,
    expires=24 * 60 * 60,
    context=True,
)
def fetch_html(
    message: typing.Union["FetchHTMLMessage", str, bytes], task: Task
) -> str:
    from web_queue.types.fetch_html_message import FetchHTMLMessage

    message = FetchHTMLMessage.from_any(message)
    message.id = task.id

    logger.info(f"Fetching HTML from {message.data.url}")

    loop = asyncio.new_event_loop()
    asyncio.set_event_loop(loop)

    try:
        wq_client: "WebQueueClient" = web_queue_settings.web_queue_client
        html_content: "HTMLContent" = loop.run_until_complete(
            wq_client.fetch(**message.data.model_dump())
        )
        return html_content.model_dump_json()

    finally:
        loop.close()

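Editor's note: the module above registers fetch_html on a Redis-backed huey app, so callers enqueue a FetchHTMLMessage and read back the serialized HTMLContent. Below is a minimal producer-side sketch, not part of the package, assuming WEB_QUEUE_NAME/WEB_QUEUE_URL are set and a huey consumer is processing web_queue.app.huey_app (e.g. via the huey_consumer command); the target URL is hypothetical.

# Hypothetical producer sketch: enqueue a fetch and block for the result.
from web_queue.app import fetch_html
from web_queue.types.fetch_html_message import (
    FetchHTMLMessage,
    FetchHTMLMessageRequest,
)
from web_queue.types.html_content import HTMLContent

message = FetchHTMLMessage(
    data=FetchHTMLMessageRequest(url="https://example.com/article/1")
)
result = fetch_html(message.model_dump_json())  # returns a huey Result handle
html_content = HTMLContent.model_validate_json(result.get(blocking=True))
print(html_content.title, len(html_content.content))
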
web_queue2-0.1.0/web_queue/client/_client.py
ADDED
@@ -0,0 +1,95 @@
import functools
import typing

import httpx
import yarl

if typing.TYPE_CHECKING:
    from web_queue.client.ai import AI
    from web_queue.client.clean import Clean
    from web_queue.client.config import Settings
    from web_queue.client.web import Web
    from web_queue.types.html_content import HTMLContent


class WebQueueClient:
    def __init__(self, settings: typing.Optional["Settings"] = None):
        from web_queue.client.config import Settings

        self.settings = settings or Settings()

    @functools.cached_property
    def web(self) -> "Web":
        from web_queue.client.web import Web

        return Web(self)

    @functools.cached_property
    def clean(self) -> "Clean":
        from web_queue.client.clean import Clean

        return Clean(self)

    @functools.cached_property
    def ai(self) -> "AI":
        from web_queue.client.ai import AI

        return AI(self)

    async def fetch(
        self,
        url: yarl.URL | httpx.URL | str,
        *,
        headless: bool = False,
        goto_timeout: int = 4000,  # 4 seconds
        circling_times: int = 2,
        scrolling_times: int = 3,
        human_delay_base_delay: float = 1.2,
        dynamic_content_loading_delay: float = 2.0,
    ) -> "HTMLContent":
        from web_queue.types.html_content import HTMLContent
        from web_queue.utils.html_to_str import htmls_to_str

        # Fetch HTML
        html = await self.web.fetch(
            url,
            headless=headless,
            goto_timeout=goto_timeout,
            circling_times=circling_times,
            scrolling_times=scrolling_times,
            human_delay_base_delay=human_delay_base_delay,
            dynamic_content_loading_delay=dynamic_content_loading_delay,
        )

        # Clean HTML
        html = self.clean.as_main_content(html)

        # Extract content metadata
        html_metadata = await self.ai.as_html_metadata(html)

        if not html_metadata:
            raise ValueError(f"Failed to retrieve content metadata for url: {url}")

        # Extract content body
        content_body_htmls = html.select(html_metadata.content_body_css_selector)
        if not content_body_htmls:
            raise ValueError(
                "Failed to retrieve content body by css selector "
                + f"'{html_metadata.content_body_css_selector}' "
                + f"for url: '{url}'"
            )

        content_body_text = htmls_to_str(content_body_htmls)

        html_content = HTMLContent(
            title=html_metadata.title,
            author=html_metadata.author,
            chapter_id=html_metadata.chapter_id,
            chapter_number=html_metadata.chapter_number,
            content=content_body_text,
            created_date=html_metadata.created_date,
            updated_date=html_metadata.updated_date,
        )

        html_content._html = str(html)
        return html_content

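Editor's note: fetch() chains the three sub-clients (Web renders the page with Playwright, Clean strips it to main content, AI picks the content selector and metadata). A minimal direct-call sketch, not part of the package, assuming OPENAI_API_KEY is configured and Playwright's Chromium is installed; the URL is hypothetical.

import asyncio

from web_queue.client import WebQueueClient


async def main() -> None:
    client = WebQueueClient()  # settings are read from the environment
    # Hypothetical target URL; headless=True avoids opening a browser window.
    content = await client.fetch("https://example.com/novel/chapter-1", headless=True)
    print(content.title, content.author, content.created_date)


asyncio.run(main())
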
web_queue2-0.1.0/web_queue/client/ai/_ai.py
ADDED
@@ -0,0 +1,140 @@
import asyncio
import datetime
import hashlib
import logging
import textwrap
import typing
import zoneinfo

import logfire
from rich.pretty import pretty_repr

from web_queue.client import WebQueueClient
from web_queue.types.html_metadata_response import HTMLMetadataResponse
from web_queue.utils.compression import compress, decompress

if typing.TYPE_CHECKING:
    import bs4

logger = logging.getLogger(__name__)


class AI:
    def __init__(self, client: WebQueueClient):
        self.client = client

    @logfire.instrument
    async def as_html_metadata(
        self, html: typing.Union["bs4.BeautifulSoup", typing.Text]
    ) -> typing.Optional[HTMLMetadataResponse]:
        """Extract content metadata and CSS selector from HTML.

        Analyzes HTML to find content body selector and extract metadata values.
        """
        openai_client = self.client.settings.openai_client
        model_name = self.client.settings.OPENAI_MODEL

        html = str(html)

        logger.info(f"AI is extracting content metadata from HTML: {html}")

        cache_key = (
            "retrieve_html_content_metadata:"
            + f"{hashlib.md5(html.encode('utf-8')).hexdigest()}"
        )

        might_cached_data: typing.Text | None = await asyncio.to_thread(
            self.client.settings.compressed_base64_cache.get, cache_key
        )
        if might_cached_data is not None:
            logger.debug(
                "Hit cache 'as_html_content_metadata':"
                + f"{pretty_repr(html, max_string=32)}"
            )
            return HTMLMetadataResponse.model_validate_json(
                decompress(might_cached_data)
            )

        # Get current time in Asia/Taipei timezone for relative date parsing
        current_time = datetime.datetime.now(zoneinfo.ZoneInfo("Asia/Taipei"))
        current_time_iso = current_time.isoformat()

        system_prompt = textwrap.dedent(
            f"""
            You are an HTML structure analysis expert. Task: From the provided HTML, extract content metadata and identify CSS selectors.

            Current time (Asia/Taipei timezone): {current_time_iso}

            Instructions:
            1. **content_body_css_selector**: Find the CSS selector for the main content body element containing ONLY the article text.
               - Look for semantic tags like <article>, <main>, or <div> with classes/IDs like 'body', 'content', 'text', 'novel-body'.
               - EXCLUDE elements containing metadata (title, author, dates, navigation, footer, ads, comments).
               - Example: 'div.article-body', 'div#novel-content', 'div.p-novel__text'.
               - Return empty string if not found.

            2. **title**: Extract the actual title text (chapter title, article title).
               - Look in <h1>, <h2>, or elements with class/id containing 'title', 'heading'.
               - Return the text content, not the CSS selector.
               - Return empty string if not found.

            3. **author**: Extract the actual author name or username.
               - Look in elements with class/id containing 'author', 'writer', 'username'.
               - Return the text content.
               - Return empty string if not found.

            4. **chapter_id**: Extract the actual chapter identifier (e.g., '12345', 'ch-001').
               - Look in data attributes, URLs, or element IDs.
               - Return empty string if not found.

            5. **chapter_number**: Extract the actual chapter number (e.g., '1', '42', 'Chapter 5').
               - Return empty string if not found.

            6. **created_date** and **updated_date**: Parse dates to ISO 8601 format with +08:00 timezone.
               - For absolute dates: Convert to 'YYYY-MM-DDTHH:MM:SS+08:00' format.
               - For relative dates ('2 days ago', '3 hours ago'): Calculate from current_time and format.
               - Return empty string if not found.

            Rules:
            - If any field is not found or unclear, return empty string "".
            - Do not guess or make up information.
            - Focus on precision and accuracy.

            Now, analyze the provided HTML and extract all available metadata.
            """  # noqa: E501
        ).strip()

        try:
            parsed_cmpl = await openai_client.chat.completions.parse(
                messages=[
                    {"role": "system", "content": system_prompt},
                    {"role": "user", "content": html},
                ],
                model=model_name,
                response_format=HTMLMetadataResponse,
            )
            response_msg = parsed_cmpl.choices[0].message
            if response_msg.refusal:
                logger.error(f"LLM refusal: {response_msg.refusal}")
                return None

            elif response_msg.parsed:
                output: HTMLMetadataResponse = response_msg.parsed
                output._html = html
                logger.info(f"LLM response: {output}")

                # Cache the response
                await asyncio.to_thread(
                    self.client.settings.compressed_base64_cache.set,
                    cache_key,
                    compress(output.model_dump_json()),
                )

                return output

            else:
                logger.error(f"LLM Error for message: {response_msg}")
                return None

        except Exception as e:
            logger.error(f"Parsing failed: {e}")
            return None

web_queue2-0.1.0/web_queue/client/clean/_clean.py
ADDED
@@ -0,0 +1,25 @@
import logging
import typing

import bs4

from web_queue.client import WebQueueClient
from web_queue.utils.html_cleaner import HTMLCleaner

logger = logging.getLogger(__name__)


class Clean:
    def __init__(self, client: WebQueueClient):
        self.client = client

    def as_main_content(self, html: bs4.BeautifulSoup | str) -> bs4.BeautifulSoup:
        html = (
            bs4.BeautifulSoup(html, "html.parser")
            if isinstance(html, typing.Text)
            else html
        )

        logger.info(f"Cleaning HTML: {html}")
        cleaned_html = HTMLCleaner.clean_as_main_content_html_str(html)
        return bs4.BeautifulSoup(cleaned_html, "html.parser")

web_queue2-0.1.0/web_queue/client/config.py
ADDED
@@ -0,0 +1,61 @@
import functools
import pathlib
import typing

import cachetic
import openai
import pydantic as pydantic
import pydantic_settings


class Settings(pydantic_settings.BaseSettings):
    OPENAI_MODEL: str = pydantic.Field(default="gpt-4.1-nano")
    OPENAI_API_KEY: pydantic.SecretStr = pydantic.SecretStr("")

    # Cache
    WEB_CACHE_PATH: typing.Text = pydantic.Field(default="./.cache/web.cache")
    WEB_CACHE_EXPIRE_SECONDS: int = pydantic.Field(default=60 * 60 * 24)  # 1 day
    WEB_SCREENSHOT_PATH: typing.Text = pydantic.Field(default="./data/screenshots")
    WEB_PDF_PATH: typing.Text = pydantic.Field(default="./data/pdfs")
    COMPRESSED_BASE64_CACHE_PATH: typing.Text = pydantic.Field(
        default="./.cache/compressed_base64.cache"
    )
    COMPRESSED_BASE64_CACHE_EXPIRE_SECONDS: int = pydantic.Field(
        default=60 * 60 * 24
    )  # 1 day

    @functools.cached_property
    def openai_client(self) -> openai.AsyncOpenAI:
        return openai.AsyncOpenAI(api_key=self.OPENAI_API_KEY.get_secret_value())

    @functools.cached_property
    def web_cache(self) -> "cachetic.Cachetic[typing.Text]":
        import cachetic

        return cachetic.Cachetic(
            object_type=pydantic.TypeAdapter(typing.Text),
            cache_url=pathlib.Path(self.WEB_CACHE_PATH),
            default_ttl=self.WEB_CACHE_EXPIRE_SECONDS,
        )

    @functools.cached_property
    def compressed_base64_cache(self) -> "cachetic.Cachetic[typing.Text]":
        import cachetic

        return cachetic.Cachetic(
            object_type=pydantic.TypeAdapter(typing.Text),
            cache_url=pathlib.Path(self.COMPRESSED_BASE64_CACHE_PATH),
            default_ttl=self.COMPRESSED_BASE64_CACHE_EXPIRE_SECONDS,
        )

    @property
    def web_screenshot_path(self) -> pathlib.Path:
        _path = pathlib.Path(self.WEB_SCREENSHOT_PATH)
        _path.mkdir(parents=True, exist_ok=True)
        return _path

    @property
    def web_pdf_path(self) -> pathlib.Path:
        _path = pathlib.Path(self.WEB_PDF_PATH)
        _path.mkdir(parents=True, exist_ok=True)
        return _path

web_queue2-0.1.0/web_queue/client/web/_web.py
ADDED
@@ -0,0 +1,175 @@
import asyncio
import logging
import secrets
import time
import typing

import bs4
import fastapi
import httpx
import yarl
from playwright._impl._api_structures import ViewportSize
from playwright.async_api import async_playwright
from playwright.sync_api import TimeoutError as PlaywrightTimeoutError
from str_or_none import str_or_none

from web_queue.client import WebQueueClient
from web_queue.utils.compression import compress, decompress
from web_queue.utils.human_delay import human_delay
from web_queue.utils.page_with_init_script import page_with_init_script
from web_queue.utils.simulate_mouse_circling import simulate_mouse_circling
from web_queue.utils.simulate_scrolling import simulate_scrolling

logger = logging.getLogger(__name__)


class Web:
    USER_AGENTS: typing.ClassVar[typing.Tuple[typing.Text, ...]] = (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",  # noqa: E501
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",  # noqa: E501
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",  # noqa: E501
    )
    VIEWPORT_SIZES: typing.ClassVar[typing.Tuple[typing.Tuple[int, int], ...]] = (
        (1920, 1080),
        (1366, 768),
        (1440, 900),
    )

    def __init__(self, client: WebQueueClient):
        self.client = client

    async def fetch(
        self,
        url: typing.Text | yarl.URL | httpx.URL,
        *,
        headless: bool = True,
        goto_timeout: int = 4000,  # 4 seconds
        circling_times: int = 3,
        scrolling_times: int = 3,
        human_delay_base_delay: float = 1.2,
        dynamic_content_loading_delay: float = 2.0,
    ) -> bs4.BeautifulSoup:
        _url = str_or_none(str(url))
        if not _url:
            raise fastapi.exceptions.HTTPException(status_code=400, detail="Empty URL")

        html_content: typing.Text | None = None
        h_delay = human_delay_base_delay
        d_delay = dynamic_content_loading_delay

        logger.info(f"Browser is fetching {_url}")
        maybe_html_content = self.client.settings.web_cache.get(_url)
        if maybe_html_content:
            logger.debug(f"Hit web cache for {_url}")
            html_content = await asyncio.to_thread(
                decompress, maybe_html_content, format="zstd"
            )
            return bs4.BeautifulSoup(html_content, "html.parser")

        async with async_playwright() as p:
            browser = await p.chromium.launch(
                headless=headless,
                args=[
                    "--no-sandbox",
                    "--disable-blink-features=AutomationControlled",
                    "--disable-dev-shm-usage",
                    "--disable-web-security",
                    "--disable-features=VizDisplayCompositor",
                ],
            )

            # Create context
            _viewport_size = secrets.choice(self.VIEWPORT_SIZES)
            _viewport = ViewportSize(width=_viewport_size[0], height=_viewport_size[1])
            context = await browser.new_context(
                user_agent=secrets.choice(self.USER_AGENTS),
                viewport=_viewport,
                locale="en-US",
                timezone_id="Asia/Tokyo",
                permissions=["geolocation"],
                extra_http_headers={
                    "Accept-Language": "en-US,en;q=0.9,ja;q=0.8",
                    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",  # noqa: E501
                    "Accept-Encoding": "gzip, deflate, br",
                    "Accept-Charset": "utf-8",
                },
            )

            # Create new page
            page = await context.new_page()

            # Inject script to hide automation features
            page = await page_with_init_script(page)

            try:
                # Navigate to URL
                logger.debug(f"Navigating (timeout: {goto_timeout}ms) to {_url}")
                try:
                    await page.goto(
                        _url, wait_until="domcontentloaded", timeout=goto_timeout
                    )  # Wait for network idle
                except PlaywrightTimeoutError:
                    logger.info(f"Timeout for goto '{_url}', continuing...")
                await human_delay(h_delay)  # Initial delay

                # Wait for full page load (additional checks)
                logger.debug(f"Waiting {h_delay}s for full page load")
                await page.wait_for_load_state("domcontentloaded")
                await human_delay(h_delay)

                # Simulate smooth mouse circling
                start_position = None
                for i in range(circling_times):
                    logger.debug(f"Simulating mouse circling {i+1} of {circling_times}")
                    start_position = await simulate_mouse_circling(
                        page, _viewport, start_position=start_position
                    )
                    await human_delay(h_delay)

                # Simulate scrolling
                for i in range(scrolling_times):
                    logger.debug(f"Simulating scrolling {i+1} of {scrolling_times}")
                    await simulate_scrolling(page, scroll_direction="down")
                    await human_delay(h_delay)

                # Extra delay for dynamic content loading
                logger.debug(f"Delaying {d_delay}s for dynamic content loading")
                await human_delay(d_delay)

                # Get full HTML content
                html_content = await page.content()
                html_content = str_or_none(html_content)
                html_content_size = len(html_content or " ")

                logger.info(
                    f"Fetched HTML content size: {html_content_size} for {_url}"
                )

                # Screenshot and PDF
                snapshot_filename = f"{int(time.time()*1E3)}_{secrets.token_hex(2)}"
                screenshot_path = self.client.settings.web_screenshot_path.joinpath(
                    f"{snapshot_filename}.png"
                )
                screenshot_path.write_bytes(await page.screenshot())
                logger.info(f"Screenshot saved to {screenshot_path}")
                pdf_path = self.client.settings.web_pdf_path.joinpath(
                    f"{snapshot_filename}.pdf"
                )
                await page.pdf(path=pdf_path, print_background=True)
                logger.info(f"PDF saved to {pdf_path}")

            finally:
                await browser.close()

        if not html_content:
            raise fastapi.exceptions.HTTPException(
                status_code=500, detail="Failed to fetch content"
            )

        await asyncio.to_thread(
            self.client.settings.web_cache.set,
            _url,
            compress(html_content, format="zstd"),
        )

        return bs4.BeautifulSoup(html_content, "html.parser")

web_queue2-0.1.0/web_queue/config.py
ADDED
@@ -0,0 +1,33 @@
import functools
import typing

import pydantic
import pydantic_settings
import yarl
from str_or_none import str_or_none

if typing.TYPE_CHECKING:
    from web_queue.client import WebQueueClient


class Settings(pydantic_settings.BaseSettings):
    WEB_QUEUE_NAME: str = pydantic.Field(default="web-queue")
    WEB_QUEUE_URL: pydantic.SecretStr = pydantic.SecretStr("")

    @pydantic.model_validator(mode="after")
    def validate_values(self) -> typing.Self:
        if str_or_none(self.WEB_QUEUE_NAME) is None:
            raise ValueError("WEB_QUEUE_NAME is required")
        if str_or_none(self.WEB_QUEUE_URL.get_secret_value()) is None:
            raise ValueError("WEB_QUEUE_URL is required")
        return self

    @functools.cached_property
    def web_queue_client(self) -> "WebQueueClient":
        from web_queue.client import WebQueueClient

        return WebQueueClient()

    @property
    def web_queue_safe_url(self) -> str:
        return str(yarl.URL(self.WEB_QUEUE_URL.get_secret_value()).with_password("***"))

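Editor's note: Settings is a pydantic-settings BaseSettings, so WEB_QUEUE_NAME and WEB_QUEUE_URL are normally supplied via environment variables, and the validator rejects an empty URL. A small sketch, not part of the package, using a hypothetical local Redis URL.

import os

# Hypothetical values; WEB_QUEUE_URL must be non-empty or validate_values() raises.
os.environ["WEB_QUEUE_NAME"] = "web-queue"
os.environ["WEB_QUEUE_URL"] = "redis://:password@localhost:6379/0"

import web_queue.config

settings = web_queue.config.Settings()
print(settings.web_queue_safe_url)  # password is masked as '***'
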
web_queue2-0.1.0/web_queue/types/__init__.py
File without changes

web_queue2-0.1.0/web_queue/types/fetch_html_message.py
ADDED
@@ -0,0 +1,26 @@
import typing

import pydantic
from str_or_none import str_or_none

from web_queue.types.message import Message


class FetchHTMLMessageRequest(pydantic.BaseModel):
    url: str
    headless: bool = False
    goto_timeout: int = 4000
    circling_times: int = 2
    scrolling_times: int = 3
    human_delay_base_delay: float = 1.2
    dynamic_content_loading_delay: float = 2

    @pydantic.model_validator(mode="after")
    def validate_url(self) -> typing.Self:
        if not str_or_none(self.url):
            raise ValueError("URL is required")
        return self


class FetchHTMLMessage(Message):
    data: FetchHTMLMessageRequest

web_queue2-0.1.0/web_queue/types/html_content.py
ADDED
@@ -0,0 +1,18 @@
import logging

import pydantic

logger = logging.getLogger(__name__)


class HTMLContent(pydantic.BaseModel):
    title: str = pydantic.Field(default="")
    author: str = pydantic.Field(default="")
    chapter_id: str = pydantic.Field(default="")
    chapter_number: str = pydantic.Field(default="")
    content: str = pydantic.Field(default="")
    created_date: str = pydantic.Field(default="")
    updated_date: str = pydantic.Field(default="")

    # Private attributes
    _html: str = pydantic.PrivateAttr(default="")

web_queue2-0.1.0/web_queue/types/html_metadata_response.py
ADDED
@@ -0,0 +1,82 @@
import logging

import pydantic

logger = logging.getLogger(__name__)


class HTMLMetadataResponse(pydantic.BaseModel):
    """Structured response for HTML content metadata and element locators.

    Extracts content body CSS selector and metadata values.
    """

    title: str = pydantic.Field(
        default="",
        description=(
            "The actual title text of the content "
            "(e.g., chapter title, article title). "
            "Return empty string if not found."
        ),
    )

    author: str = pydantic.Field(
        default="",
        description=(
            "The actual author name or username. Return empty string if not found."
        ),
    )

    chapter_id: str = pydantic.Field(
        default="",
        description=(
            "The actual chapter ID or identifier (e.g., '12345', 'ch-001'). "
            "Return empty string if not found."
        ),
    )

    chapter_number: str = pydantic.Field(
        default="",
        description=(
            "The actual chapter number (e.g., '1', '42', 'Chapter 5'). "
            "Return empty string if not found."
        ),
    )

    content_body_css_selector: str = pydantic.Field(
        default="",
        description=(
            "CSS selector for the main content body element "
            "containing article text only. "
            "Exclude metadata like title, author, dates. "
            "Example: 'div.article-body', 'div#novel-content'. "
            "Use standard CSS syntax. Return empty string if not found."
        ),
    )

    created_date: str = pydantic.Field(
        default="",
        description=(
            "The content creation date in ISO 8601 format "
            "with Asia/Taipei timezone "
            "(e.g., '2025-10-12T14:30:00+08:00'). "
            "Parse relative dates like '2 days ago' "
            "using the current_time provided in the system prompt. "
            "Return empty string if not found."
        ),
    )

    updated_date: str = pydantic.Field(
        default="",
        description=(
            "The content last update date in ISO 8601 format "
            "with Asia/Taipei timezone "
            "(e.g., '2025-10-12T14:30:00+08:00'). "
            "Parse relative dates like '2 days ago' "
            "using the current_time provided in the system prompt. "
            "Return empty string if not found."
        ),
    )

    # Private attributes
    _html: str = pydantic.PrivateAttr(default="")

web_queue2-0.1.0/web_queue/types/message.py
ADDED
@@ -0,0 +1,34 @@
import enum
import json
import typing

import pydantic


class MessageStatus(enum.StrEnum):
    PENDING = "pending"
    RUNNING = "running"
    COMPLETED = "completed"
    FAILED = "failed"


class Message(pydantic.BaseModel):
    id: str | None = None
    data: typing.Any
    status: MessageStatus = pydantic.Field(default=MessageStatus.PENDING)
    total_steps: int = pydantic.Field(default=100)
    completed_steps: int = pydantic.Field(default=0)
    error: typing.Optional[str] = pydantic.Field(default=None)

    @classmethod
    def from_any(cls, any: typing.Union[pydantic.BaseModel, typing.Dict, str, bytes]):
        if isinstance(any, pydantic.BaseModel):
            return cls.model_validate_json(any.model_dump_json())
        elif isinstance(any, typing.Dict):
            return cls.model_validate_json(json.dumps(any))
        elif isinstance(any, str):
            return cls.model_validate_json(any)
        elif isinstance(any, bytes):
            return cls.model_validate_json(any)
        else:
            raise ValueError(f"Invalid type: {type(any)}")

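Editor's note: from_any normalizes whatever payload a task receives (pydantic model, dict, JSON string, or bytes) into a validated Message. A small sketch, not part of the package, using the FetchHTMLMessage subclass defined earlier; the URL is hypothetical.

from web_queue.types.fetch_html_message import FetchHTMLMessage

payload = {"data": {"url": "https://example.com/post/42"}}

m1 = FetchHTMLMessage.from_any(payload)               # dict
m2 = FetchHTMLMessage.from_any(m1)                    # pydantic model
m3 = FetchHTMLMessage.from_any(m1.model_dump_json())  # JSON string
assert m1.data.url == m2.data.url == m3.data.url
assert m1.status == "pending"  # MessageStatus.PENDING default
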
web_queue2-0.1.0/web_queue/utils/__init__.py
File without changes

web_queue2-0.1.0/web_queue/utils/compression.py
ADDED
@@ -0,0 +1,29 @@
import base64
import typing

import fastapi
import zstandard


def compress(
    data: str, *, level: int = 9, format: typing.Literal["zstd"] = "zstd"
) -> str:
    if format == "zstd":
        return base64.b64encode(
            zstandard.compress(data.encode("utf-8"), level=level)
        ).decode("utf-8")
    else:
        raise fastapi.exceptions.HTTPException(
            status_code=400, detail=f"Invalid format: {format}"
        )


def decompress(data: str, *, format: typing.Literal["zstd"] = "zstd") -> str:
    if format == "zstd":
        return zstandard.decompress(base64.b64decode(data.encode("utf-8"))).decode(
            "utf-8"
        )
    else:
        raise fastapi.exceptions.HTTPException(
            status_code=400, detail=f"Invalid format: {format}"
        )

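Editor's note: compress/decompress wrap zstandard plus base64 so compressed HTML can be stored in the text-only caches. A round-trip sketch, not part of the package:

from web_queue.utils.compression import compress, decompress

original = "<html><body><p>hello</p></body></html>" * 100
packed = compress(original, level=9, format="zstd")  # base64-encoded zstd
assert decompress(packed, format="zstd") == original
print(len(original), "->", len(packed))
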
web_queue2-0.1.0/web_queue/utils/html_cleaner.py
ADDED
@@ -0,0 +1,145 @@
import re
import typing

import bs4

DEFAULT_KEEP_TAGS: typing.Tuple[typing.Text, ...] = (
    "a",
    "article",
    "body",
    "br",
    "div",
    "h1",
    "h2",
    "h3",
    "h4",
    "h5",
    "h6",
    "hr",
    "html",
    "li",
    "main",
    "ol",
    "p",
    "section",
    "table",
    "tbody",
    "td",
    "th",
    "tr",
    "ul",
)
DEFAULT_KEEP_ATTRIBUTES: typing.Tuple[typing.Text, ...] = ("id", "class")
DEFAULT_DROP_TAGS: typing.Tuple[typing.Text, ...] = ("script", "style", "iframe")


class HTMLCleaner:
    @staticmethod
    def clean_as_main_content_html(
        html: typing.Text | bs4.BeautifulSoup,
    ) -> bs4.BeautifulSoup:
        html = (
            bs4.BeautifulSoup(html, "html.parser")
            if isinstance(html, typing.Text)
            else html
        )
        html = HTMLCleaner.clean_all_comments(html)
        html = HTMLCleaner.keep_only_tags(html)
        html = HTMLCleaner.clean_tags(html)
        html = HTMLCleaner.clean_attributes(html)
        html = HTMLCleaner.keep_first_class_name(html)
        return html

    @staticmethod
    def clean_as_main_content_html_str(
        html: typing.Text | bs4.BeautifulSoup,
    ) -> str:
        html = HTMLCleaner.clean_as_main_content_html(html)
        return re.sub(r">\s+<", "><", str(html))

    @staticmethod
    def keep_only_tags(
        html: typing.Text | bs4.BeautifulSoup,
        tags: typing.List[typing.Text] | None = None,
    ) -> bs4.BeautifulSoup:
        html = (
            bs4.BeautifulSoup(html, "html.parser")
            if isinstance(html, typing.Text)
            else html
        )
        tags = tags or list(DEFAULT_KEEP_TAGS)

        # Find all tags that are not in the keep list and decompose them
        for tag in html.find_all():
            if tag.name not in tags:
                tag.decompose()

        return html

    @staticmethod
    def keep_first_class_name(
        html: typing.Text | bs4.BeautifulSoup,
    ) -> bs4.BeautifulSoup:
        html = (
            bs4.BeautifulSoup(html, "html.parser")
            if isinstance(html, typing.Text)
            else html
        )

        # Keep only the first class name for elements with multiple classes
        for tag in html.find_all(attrs={"class": True}):
            class_attr = tag.get("class")
            if isinstance(class_attr, list) and len(class_attr) > 1:
                tag["class"] = class_attr[0]
            elif isinstance(class_attr, str):
                classes = class_attr.split()
                if len(classes) > 1:
                    tag["class"] = classes[0]

        return html

    @staticmethod
    def clean_attributes(
        html: typing.Text | bs4.BeautifulSoup,
        attributes: typing.List[typing.Text] | None = None,
    ) -> bs4.BeautifulSoup:
        html = (
            bs4.BeautifulSoup(html, "html.parser")
            if isinstance(html, typing.Text)
            else html
        )
        attributes = attributes or list(DEFAULT_KEEP_ATTRIBUTES)
        for tag in html.find_all():
            for attribute in list(tag.attrs):
                if attribute not in attributes:
                    tag.attrs.pop(attribute, None)

        return html

    @staticmethod
    def clean_all_comments(html: typing.Text | bs4.BeautifulSoup) -> bs4.BeautifulSoup:
        html = (
            bs4.BeautifulSoup(html, "html.parser")
            if isinstance(html, typing.Text)
            else html
        )
        for comment in html.find_all(text=lambda text: isinstance(text, bs4.Comment)):
            comment.decompose()
        return html

    @staticmethod
    def clean_tags(
        html: typing.Text | bs4.BeautifulSoup,
        tags: typing.List[typing.Text] | None = None,
    ) -> bs4.BeautifulSoup:
        html = (
            bs4.BeautifulSoup(html, "html.parser")
            if isinstance(html, typing.Text)
            else html
        )
        tags = tags or list(DEFAULT_DROP_TAGS)

        for tag in html.find_all(tags):
            tag.decompose()

        return html

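Editor's note: HTMLCleaner reduces a page to a skeleton of whitelisted tags with at most an id and a single class per element, which keeps the HTML passed to the model small. A short sketch, not part of the package; the expected output is approximate.

from web_queue.utils.html_cleaner import HTMLCleaner

raw = """
<html><body>
  <script>track();</script>
  <div class="post main wide" data-x="1">
    <h1 id="title">Chapter 1</h1>
    <p>Once upon a time...</p>
  </div>
</body></html>
"""
print(HTMLCleaner.clean_as_main_content_html_str(raw))
# e.g. <html><body><div class="post"><h1 id="title">Chapter 1</h1><p>Once upon a time...</p></div></body></html>
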
web_queue2-0.1.0/web_queue/utils/html_to_str.py
ADDED
@@ -0,0 +1,21 @@
import typing

import bs4


def html_to_str(html: bs4.BeautifulSoup | bs4.Tag | str) -> str:
    html = bs4.BeautifulSoup(html, "html.parser") if isinstance(html, str) else html

    full_text = ""
    for p in html.find_all("p"):
        content = p.get_text(separator="\n", strip=True)
        full_text += content
        full_text += "\n"

    return full_text.strip()


def htmls_to_str(
    htmls: typing.List[bs4.BeautifulSoup | bs4.Tag | str] | bs4.ResultSet[bs4.Tag],
) -> str:
    return "\n\n".join(html_to_str(h) for h in htmls)

web_queue2-0.1.0/web_queue/utils/human_delay.py
ADDED
@@ -0,0 +1,11 @@
import asyncio
import random


async def human_delay(
    base_delay: float = 1.2, *, jitter_ratio: tuple[float, float] = (0.5, 1.5)
) -> None:
    jitter = random.uniform(jitter_ratio[0], jitter_ratio[1])
    total_delay = base_delay * jitter
    await asyncio.sleep(total_delay)
    return None

web_queue2-0.1.0/web_queue/utils/page_with_init_script.py
ADDED
@@ -0,0 +1,15 @@
import playwright.async_api


async def page_with_init_script(
    page: playwright.async_api.Page,
) -> playwright.async_api.Page:
    await page.add_init_script(
        """
        Object.defineProperty(navigator, 'webdriver', {get: () => undefined});
        Object.defineProperty(navigator, 'plugins', {get: () => [1, 2, 3, 4, 5]});
        Object.defineProperty(navigator, 'languages', {get: () => ['en-US', 'en', 'ja']});
        window.chrome = {runtime: {}};
        """
    )
    return page

web_queue2-0.1.0/web_queue/utils/simulate_mouse_circling.py
ADDED
@@ -0,0 +1,49 @@
import asyncio
import math
import random
import typing

import playwright.async_api
from playwright._impl._api_structures import ViewportSize

Number: typing.TypeAlias = float | int


async def simulate_mouse_circling(
    page: playwright.async_api.Page,
    default_viewport_size: ViewportSize | None = None,
    *,
    start_position: tuple[Number, Number] | None = None,
) -> tuple[Number, Number]:
    _viewport_size = (
        page.viewport_size
        or default_viewport_size
        or ViewportSize(width=1920, height=1080)
    )

    # Random starting position
    if start_position:
        start_x = start_position[0]
        start_y = start_position[1]
    else:
        start_x = random.randint(100, _viewport_size["width"] - 100)
        start_y = random.randint(100, _viewport_size["height"] - 100)
    center_x = start_x + 100
    center_y = start_y + 100
    radius = 50
    x = center_x
    y = center_y

    # Simulate smooth circle: Move to multiple points
    for angle in range(0, 360, 60):  # A point every 60 degrees
        rad = (angle * 3.14159) / 180
        x = center_x + radius * random.uniform(0.8, 1.2) * random.choice([-1, 1]) * abs(
            math.cos(rad)
        )
        y = center_y + radius * random.uniform(0.8, 1.2) * random.choice([-1, 1]) * abs(
            math.sin(rad)
        )
        await page.mouse.move(x, y, steps=random.randint(10, 20))  # Smooth movement
        await asyncio.sleep(random.uniform(0.01, 0.05))  # Tiny delay

    return (x, y)

web_queue2-0.1.0/web_queue/utils/simulate_scrolling.py
ADDED
@@ -0,0 +1,18 @@
import random

import playwright.async_api
from typing_extensions import Literal


async def simulate_scrolling(
    page: playwright.async_api.Page,
    scroll_direction: Literal["down", "up"] | None = None,
    scroll_distance: int | None = None,
) -> None:
    scroll_direction = scroll_direction or random.choice(["down", "up"])
    scroll_distance = scroll_distance or random.randint(200, 800)
    if scroll_direction == "down":
        await page.mouse.wheel(0, scroll_distance)
    else:
        await page.mouse.wheel(0, -scroll_distance)
    return None