web-queue2 0.2.0__tar.gz → 0.3.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {web_queue2-0.2.0 → web_queue2-0.3.0}/PKG-INFO +3 -1
- {web_queue2-0.2.0 → web_queue2-0.3.0}/pyproject.toml +3 -1
- web_queue2-0.3.0/web_queue/VERSION +1 -0
- web_queue2-0.3.0/web_queue/app.py +127 -0
- web_queue2-0.3.0/web_queue/client/__init__.py +4 -0
- {web_queue2-0.2.0 → web_queue2-0.3.0}/web_queue/client/_client.py +9 -2
- {web_queue2-0.2.0 → web_queue2-0.3.0}/web_queue/client/ai/_ai.py +16 -4
- {web_queue2-0.2.0 → web_queue2-0.3.0}/web_queue/client/config.py +31 -0
- web_queue2-0.3.0/web_queue/client/messages/__init__.py +3 -0
- web_queue2-0.3.0/web_queue/client/messages/_messages.py +116 -0
- {web_queue2-0.2.0 → web_queue2-0.3.0}/web_queue/client/web/_web.py +30 -6
- {web_queue2-0.2.0 → web_queue2-0.3.0}/web_queue/types/message.py +13 -1
- web_queue2-0.3.0/web_queue/types/model_var.py +5 -0
- {web_queue2-0.2.0 → web_queue2-0.3.0}/web_queue/utils/html_to_str.py +3 -8
- web_queue2-0.2.0/web_queue/VERSION +0 -1
- web_queue2-0.2.0/web_queue/app.py +0 -91
- web_queue2-0.2.0/web_queue/client/__init__.py +0 -3
- web_queue2-0.2.0/web_queue/config.py +0 -47
- web_queue2-0.2.0/web_queue/types/step_callback.py +0 -10
- {web_queue2-0.2.0 → web_queue2-0.3.0}/LICENSE +0 -0
- {web_queue2-0.2.0 → web_queue2-0.3.0}/README.md +0 -0
- {web_queue2-0.2.0 → web_queue2-0.3.0}/web_queue/__init__.py +0 -0
- {web_queue2-0.2.0 → web_queue2-0.3.0}/web_queue/client/ai/__init__.py +0 -0
- {web_queue2-0.2.0 → web_queue2-0.3.0}/web_queue/client/clean/__init__.py +0 -0
- {web_queue2-0.2.0 → web_queue2-0.3.0}/web_queue/client/clean/_clean.py +0 -0
- {web_queue2-0.2.0 → web_queue2-0.3.0}/web_queue/client/web/__init__.py +0 -0
- {web_queue2-0.2.0 → web_queue2-0.3.0}/web_queue/types/__init__.py +0 -0
- {web_queue2-0.2.0 → web_queue2-0.3.0}/web_queue/types/fetch_html_message.py +0 -0
- {web_queue2-0.2.0 → web_queue2-0.3.0}/web_queue/types/html_content.py +0 -0
- {web_queue2-0.2.0 → web_queue2-0.3.0}/web_queue/types/html_metadata_response.py +0 -0
- {web_queue2-0.2.0 → web_queue2-0.3.0}/web_queue/utils/__init__.py +0 -0
- {web_queue2-0.2.0 → web_queue2-0.3.0}/web_queue/utils/compression.py +0 -0
- {web_queue2-0.2.0 → web_queue2-0.3.0}/web_queue/utils/html_cleaner.py +0 -0
- {web_queue2-0.2.0 → web_queue2-0.3.0}/web_queue/utils/human_delay.py +0 -0
- {web_queue2-0.2.0 → web_queue2-0.3.0}/web_queue/utils/page_with_init_script.py +0 -0
- {web_queue2-0.2.0 → web_queue2-0.3.0}/web_queue/utils/simulate_mouse_circling.py +0 -0
- {web_queue2-0.2.0 → web_queue2-0.3.0}/web_queue/utils/simulate_scrolling.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: web-queue2
|
|
3
|
-
Version: 0.2.0
|
|
3
|
+
Version: 0.3.0
|
|
4
4
|
Summary: Get web content from queue.
|
|
5
5
|
License: MIT
|
|
6
6
|
License-File: LICENSE
|
|
@@ -17,6 +17,7 @@ Requires-Dist: bs4
|
|
|
17
17
|
Requires-Dist: cachetic
|
|
18
18
|
Requires-Dist: dictpress
|
|
19
19
|
Requires-Dist: fastapi
|
|
20
|
+
Requires-Dist: html-to-markdown
|
|
20
21
|
Requires-Dist: httpx
|
|
21
22
|
Requires-Dist: huey
|
|
22
23
|
Requires-Dist: logfire[redis]
|
|
@@ -28,6 +29,7 @@ Requires-Dist: pydantic (>=2)
|
|
|
28
29
|
Requires-Dist: pydantic-settings
|
|
29
30
|
Requires-Dist: str-or-none
|
|
30
31
|
Requires-Dist: tiktoken
|
|
32
|
+
Requires-Dist: typer
|
|
31
33
|
Requires-Dist: yarl
|
|
32
34
|
Project-URL: Homepage, https://github.com/allen2c/web-queue
|
|
33
35
|
Project-URL: PyPI, https://pypi.org/project/web-queue/
|
|
@@ -5,6 +5,7 @@ dependencies = [
|
|
|
5
5
|
"cachetic",
|
|
6
6
|
"dictpress",
|
|
7
7
|
"fastapi",
|
|
8
|
+
"html-to-markdown",
|
|
8
9
|
"httpx",
|
|
9
10
|
"huey",
|
|
10
11
|
"logfire[redis]",
|
|
@@ -16,6 +17,7 @@ dependencies = [
|
|
|
16
17
|
"pydantic-settings",
|
|
17
18
|
"str-or-none",
|
|
18
19
|
"tiktoken",
|
|
20
|
+
"typer",
|
|
19
21
|
"yarl",
|
|
20
22
|
]
|
|
21
23
|
description = "Get web content from queue."
|
|
@@ -23,7 +25,7 @@ license = { text = "MIT" }
|
|
|
23
25
|
name = "web-queue2"
|
|
24
26
|
readme = "README.md"
|
|
25
27
|
requires-python = ">=3.11,<4"
|
|
26
|
-
version = "0.2.0"
|
|
28
|
+
version = "0.3.0"
|
|
27
29
|
|
|
28
30
|
[project.urls]
|
|
29
31
|
Homepage = "https://github.com/allen2c/web-queue"
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
0.3.0
|
|
@@ -0,0 +1,127 @@
|
|
|
1
|
+
import asyncio
|
|
2
|
+
import logging
|
|
3
|
+
import typing
|
|
4
|
+
|
|
5
|
+
import fastapi
|
|
6
|
+
import huey
|
|
7
|
+
import huey.exceptions
|
|
8
|
+
import logfire
|
|
9
|
+
import logging_bullet_train as lbt
|
|
10
|
+
from huey.api import Task
|
|
11
|
+
|
|
12
|
+
from web_queue.client import Settings, WebQueueClient
|
|
13
|
+
from web_queue.types.fetch_html_message import FetchHTMLMessage
|
|
14
|
+
from web_queue.types.html_content import HTMLContent
|
|
15
|
+
from web_queue.types.message import MessageStatus, MessageUpdate
|
|
16
|
+
from web_queue.types.model_var import ModelVar
|
|
17
|
+
|
|
18
|
+
lbt.set_logger("web_queue")
|
|
19
|
+
|
|
20
|
+
logfire.configure()
|
|
21
|
+
logfire.instrument_openai()
|
|
22
|
+
|
|
23
|
+
logger = logging.getLogger(__name__)
|
|
24
|
+
|
|
25
|
+
logger.info("Web queue app starting...")
|
|
26
|
+
|
|
27
|
+
wq_settings = Settings()
|
|
28
|
+
wq_client = WebQueueClient(wq_settings)
|
|
29
|
+
logger.info(f"Web queue connecting to redis: {wq_settings.web_queue_safe_url}")
|
|
30
|
+
|
|
31
|
+
huey_app = huey.RedisExpireHuey(
|
|
32
|
+
wq_settings.WEB_QUEUE_NAME,
|
|
33
|
+
url=wq_settings.WEB_QUEUE_URL.get_secret_value(),
|
|
34
|
+
expire_time=24 * 60 * 60, # 24 hours
|
|
35
|
+
)
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
def retrieve_result(task_id: str) -> typing.Optional[typing.Text]:
|
|
39
|
+
try:
|
|
40
|
+
result: str | None = huey_app.result(
|
|
41
|
+
task_id,
|
|
42
|
+
blocking=True,
|
|
43
|
+
timeout=2,
|
|
44
|
+
) # type: ignore
|
|
45
|
+
except huey.exceptions.ResultTimeout:
|
|
46
|
+
logger.error(f"Timeout waiting for result for task {task_id}")
|
|
47
|
+
return None
|
|
48
|
+
if result is None:
|
|
49
|
+
logger.error(f"No result found for task {task_id}")
|
|
50
|
+
return result
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
def retrieve_result_as(task_id: str, model: typing.Type[ModelVar]) -> ModelVar:
|
|
54
|
+
result = retrieve_result(task_id)
|
|
55
|
+
if result is None:
|
|
56
|
+
raise fastapi.HTTPException(
|
|
57
|
+
status_code=404, detail=f"No result found for task {task_id}"
|
|
58
|
+
)
|
|
59
|
+
return model.model_validate_json(result)
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
@huey_app.task(
|
|
63
|
+
retries=1,
|
|
64
|
+
retry_delay=8,
|
|
65
|
+
expires=24 * 60 * 60,
|
|
66
|
+
context=True,
|
|
67
|
+
)
|
|
68
|
+
def fetch_html(
|
|
69
|
+
message: typing.Union["FetchHTMLMessage", str, bytes], task: Task
|
|
70
|
+
) -> typing.Optional[typing.Text]:
|
|
71
|
+
from web_queue.types.fetch_html_message import FetchHTMLMessage
|
|
72
|
+
|
|
73
|
+
global wq_client
|
|
74
|
+
|
|
75
|
+
message = FetchHTMLMessage.from_any(message)
|
|
76
|
+
message.id = task.id
|
|
77
|
+
message.status = MessageStatus.RUNNING
|
|
78
|
+
|
|
79
|
+
loop = asyncio.new_event_loop()
|
|
80
|
+
asyncio.set_event_loop(loop)
|
|
81
|
+
|
|
82
|
+
update_message_func = wq_client.messages.wrap_update_message(message.id, message)
|
|
83
|
+
|
|
84
|
+
try:
|
|
85
|
+
logger.info(f"Fetching HTML with parameters: {message.data.model_dump_json()}")
|
|
86
|
+
update_message_func(
|
|
87
|
+
MessageUpdate(
|
|
88
|
+
total_steps=100,
|
|
89
|
+
completed_steps=0,
|
|
90
|
+
status=MessageStatus.RUNNING,
|
|
91
|
+
message_text="Starting to fetch HTML...",
|
|
92
|
+
)
|
|
93
|
+
)
|
|
94
|
+
|
|
95
|
+
html_content: "HTMLContent" = loop.run_until_complete(
|
|
96
|
+
wq_client.fetch(
|
|
97
|
+
**message.data.model_dump(), step_callback=update_message_func
|
|
98
|
+
)
|
|
99
|
+
)
|
|
100
|
+
|
|
101
|
+
update_message_func(
|
|
102
|
+
MessageUpdate(
|
|
103
|
+
total_steps=100,
|
|
104
|
+
completed_steps=100,
|
|
105
|
+
status=MessageStatus.COMPLETED,
|
|
106
|
+
message_text="Finished fetching HTML",
|
|
107
|
+
)
|
|
108
|
+
)
|
|
109
|
+
|
|
110
|
+
return html_content.model_dump_json()
|
|
111
|
+
|
|
112
|
+
except Exception as e:
|
|
113
|
+
logger.exception(e)
|
|
114
|
+
logger.error(f"Failed to fetch HTML: {e}")
|
|
115
|
+
update_message_func(
|
|
116
|
+
MessageUpdate(
|
|
117
|
+
total_steps=100,
|
|
118
|
+
completed_steps=100,
|
|
119
|
+
status=MessageStatus.FAILED,
|
|
120
|
+
message_text=f"Failed to fetch HTML: {e}",
|
|
121
|
+
)
|
|
122
|
+
)
|
|
123
|
+
|
|
124
|
+
finally:
|
|
125
|
+
loop.close()
|
|
126
|
+
|
|
127
|
+
return None
|
|
@@ -8,9 +8,10 @@ if typing.TYPE_CHECKING:
|
|
|
8
8
|
from web_queue.client.ai import AI
|
|
9
9
|
from web_queue.client.clean import Clean
|
|
10
10
|
from web_queue.client.config import Settings
|
|
11
|
+
from web_queue.client.messages import Messages
|
|
11
12
|
from web_queue.client.web import Web
|
|
12
13
|
from web_queue.types.html_content import HTMLContent
|
|
13
|
-
from web_queue.types.
|
|
14
|
+
from web_queue.types.message import MessageUpdate
|
|
14
15
|
|
|
15
16
|
|
|
16
17
|
class WebQueueClient:
|
|
@@ -37,6 +38,12 @@ class WebQueueClient:
|
|
|
37
38
|
|
|
38
39
|
return AI(self)
|
|
39
40
|
|
|
41
|
+
@functools.cached_property
|
|
42
|
+
def messages(self) -> "Messages":
|
|
43
|
+
from web_queue.client.messages import Messages
|
|
44
|
+
|
|
45
|
+
return Messages(self)
|
|
46
|
+
|
|
40
47
|
async def fetch(
|
|
41
48
|
self,
|
|
42
49
|
url: yarl.URL | httpx.URL | str,
|
|
@@ -47,7 +54,7 @@ class WebQueueClient:
|
|
|
47
54
|
scrolling_times: int = 3,
|
|
48
55
|
human_delay_base_delay: float = 1.2,
|
|
49
56
|
dynamic_content_loading_delay: float = 2.0,
|
|
50
|
-
step_callback: typing.Optional["
|
|
57
|
+
step_callback: typing.Optional[typing.Callable[["MessageUpdate"], None]] = None,
|
|
51
58
|
) -> "HTMLContent":
|
|
52
59
|
from web_queue.types.html_content import HTMLContent
|
|
53
60
|
from web_queue.utils.html_to_str import htmls_to_str
|
|
@@ -11,7 +11,7 @@ from rich.pretty import pretty_repr
|
|
|
11
11
|
|
|
12
12
|
from web_queue.client import WebQueueClient
|
|
13
13
|
from web_queue.types.html_metadata_response import HTMLMetadataResponse
|
|
14
|
-
from web_queue.types.
|
|
14
|
+
from web_queue.types.message import MessageUpdate
|
|
15
15
|
from web_queue.utils.compression import compress, decompress
|
|
16
16
|
|
|
17
17
|
if typing.TYPE_CHECKING:
|
|
@@ -28,7 +28,7 @@ class AI:
|
|
|
28
28
|
async def as_html_metadata(
|
|
29
29
|
self,
|
|
30
30
|
html: typing.Union["bs4.BeautifulSoup", typing.Text],
|
|
31
|
-
step_callback: typing.Optional[
|
|
31
|
+
step_callback: typing.Optional[typing.Callable[["MessageUpdate"], None]] = None,
|
|
32
32
|
) -> typing.Optional[HTMLMetadataResponse]:
|
|
33
33
|
"""Extract content metadata and CSS selector from HTML.
|
|
34
34
|
|
|
@@ -110,7 +110,13 @@ class AI:
|
|
|
110
110
|
).strip()
|
|
111
111
|
|
|
112
112
|
if step_callback:
|
|
113
|
-
step_callback(
|
|
113
|
+
step_callback(
|
|
114
|
+
MessageUpdate(
|
|
115
|
+
total_steps=100,
|
|
116
|
+
completed_steps=75,
|
|
117
|
+
message_text="Starting to extract content metadata...",
|
|
118
|
+
)
|
|
119
|
+
)
|
|
114
120
|
|
|
115
121
|
try:
|
|
116
122
|
parsed_cmpl = await openai_client.chat.completions.parse(
|
|
@@ -139,7 +145,13 @@ class AI:
|
|
|
139
145
|
)
|
|
140
146
|
|
|
141
147
|
if step_callback:
|
|
142
|
-
step_callback(
|
|
148
|
+
step_callback(
|
|
149
|
+
MessageUpdate(
|
|
150
|
+
total_steps=100,
|
|
151
|
+
completed_steps=90,
|
|
152
|
+
message_text="Finished extracting content metadata",
|
|
153
|
+
)
|
|
154
|
+
)
|
|
143
155
|
|
|
144
156
|
return output
|
|
145
157
|
|
|
@@ -6,9 +6,16 @@ import cachetic
|
|
|
6
6
|
import openai
|
|
7
7
|
import pydantic as pydantic
|
|
8
8
|
import pydantic_settings
|
|
9
|
+
from str_or_none import str_or_none
|
|
9
10
|
|
|
10
11
|
|
|
11
12
|
class Settings(pydantic_settings.BaseSettings):
|
|
13
|
+
# Core
|
|
14
|
+
WEB_QUEUE_NAME: str = pydantic.Field(default="web-queue")
|
|
15
|
+
WEB_QUEUE_URL: pydantic.SecretStr = pydantic.SecretStr("")
|
|
16
|
+
MESSAGE_CACHE_EXPIRE_SECONDS: int = pydantic.Field(default=60 * 60 * 24) # 1 day
|
|
17
|
+
|
|
18
|
+
# AI
|
|
12
19
|
OPENAI_MODEL: str = pydantic.Field(default="gpt-4.1-nano")
|
|
13
20
|
OPENAI_API_KEY: pydantic.SecretStr = pydantic.SecretStr("")
|
|
14
21
|
|
|
@@ -24,6 +31,24 @@ class Settings(pydantic_settings.BaseSettings):
|
|
|
24
31
|
default=60 * 60 * 24
|
|
25
32
|
) # 1 day
|
|
26
33
|
|
|
34
|
+
@pydantic.model_validator(mode="after")
|
|
35
|
+
def validate_values(self) -> typing.Self:
|
|
36
|
+
if str_or_none(self.WEB_QUEUE_NAME) is None:
|
|
37
|
+
raise ValueError("WEB_QUEUE_NAME is required")
|
|
38
|
+
if str_or_none(self.WEB_QUEUE_URL.get_secret_value()) is None:
|
|
39
|
+
raise ValueError("WEB_QUEUE_URL is required")
|
|
40
|
+
return self
|
|
41
|
+
|
|
42
|
+
@functools.cached_property
|
|
43
|
+
def message_cache(self) -> "cachetic.Cachetic[typing.Text]":
|
|
44
|
+
import redis
|
|
45
|
+
|
|
46
|
+
return cachetic.Cachetic(
|
|
47
|
+
object_type=pydantic.TypeAdapter(typing.Text),
|
|
48
|
+
cache_url=redis.from_url(self.WEB_QUEUE_URL.get_secret_value()),
|
|
49
|
+
default_ttl=self.MESSAGE_CACHE_EXPIRE_SECONDS,
|
|
50
|
+
)
|
|
51
|
+
|
|
27
52
|
@functools.cached_property
|
|
28
53
|
def openai_client(self) -> openai.AsyncOpenAI:
|
|
29
54
|
return openai.AsyncOpenAI(api_key=self.OPENAI_API_KEY.get_secret_value())
|
|
@@ -48,6 +73,12 @@ class Settings(pydantic_settings.BaseSettings):
|
|
|
48
73
|
default_ttl=self.COMPRESSED_BASE64_CACHE_EXPIRE_SECONDS,
|
|
49
74
|
)
|
|
50
75
|
|
|
76
|
+
@property
|
|
77
|
+
def web_queue_safe_url(self) -> str:
|
|
78
|
+
import yarl
|
|
79
|
+
|
|
80
|
+
return str(yarl.URL(self.WEB_QUEUE_URL.get_secret_value()).with_password("***"))
|
|
81
|
+
|
|
51
82
|
@property
|
|
52
83
|
def web_screenshot_path(self) -> pathlib.Path:
|
|
53
84
|
_path = pathlib.Path(self.WEB_SCREENSHOT_PATH)
|
|
@@ -0,0 +1,116 @@
|
|
|
1
|
+
import time
|
|
2
|
+
import typing
|
|
3
|
+
|
|
4
|
+
import fastapi
|
|
5
|
+
|
|
6
|
+
from web_queue.client import WebQueueClient
|
|
7
|
+
from web_queue.types.message import Message, MessageStatus, MessageUpdate, MessageVar
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
class Messages:
|
|
11
|
+
def __init__(self, client: WebQueueClient):
|
|
12
|
+
self.client = client
|
|
13
|
+
|
|
14
|
+
def get(self, message_id: str, *, timeout: float = 10.0) -> typing.Optional[str]:
|
|
15
|
+
ts = time.perf_counter()
|
|
16
|
+
while time.perf_counter() - ts < timeout:
|
|
17
|
+
message_cache_key = self.get_cache_key(message_id)
|
|
18
|
+
maybe_json = self.client.settings.message_cache.get(message_cache_key)
|
|
19
|
+
if maybe_json is not None:
|
|
20
|
+
break
|
|
21
|
+
else:
|
|
22
|
+
time.sleep(0.1)
|
|
23
|
+
|
|
24
|
+
if maybe_json is None:
|
|
25
|
+
return None
|
|
26
|
+
return maybe_json
|
|
27
|
+
|
|
28
|
+
def retrieve(self, message_id: str, *, timeout: float = 10.0) -> str:
|
|
29
|
+
json_data = self.get(message_id, timeout=timeout)
|
|
30
|
+
if json_data is None:
|
|
31
|
+
raise fastapi.HTTPException(status_code=404, detail="Message not found")
|
|
32
|
+
return json_data
|
|
33
|
+
|
|
34
|
+
def retrieve_as(
|
|
35
|
+
self, message_id: str, model: typing.Type[MessageVar], *, timeout: float = 10.0
|
|
36
|
+
) -> MessageVar:
|
|
37
|
+
json_data = self.retrieve(message_id, timeout=timeout)
|
|
38
|
+
return model.model_validate_json(json_data)
|
|
39
|
+
|
|
40
|
+
def set(self, message_id: str, message: Message) -> None:
|
|
41
|
+
message_cache_key = self.get_cache_key(message_id)
|
|
42
|
+
self.client.settings.message_cache.set(
|
|
43
|
+
message_cache_key, message.model_dump_json()
|
|
44
|
+
)
|
|
45
|
+
|
|
46
|
+
def update(
|
|
47
|
+
self,
|
|
48
|
+
message_id: str,
|
|
49
|
+
message_update: MessageUpdate,
|
|
50
|
+
) -> Message:
|
|
51
|
+
message = self.retrieve_as(message_id, Message)
|
|
52
|
+
if message_update.message_text is not None:
|
|
53
|
+
message.message_text = message_update.message_text
|
|
54
|
+
if message_update.data is not None:
|
|
55
|
+
message.data = message_update.data
|
|
56
|
+
if message_update.status is not None:
|
|
57
|
+
message.status = message_update.status
|
|
58
|
+
if message_update.total_steps is not None:
|
|
59
|
+
message.total_steps = message_update.total_steps
|
|
60
|
+
if message_update.completed_steps is not None:
|
|
61
|
+
message.completed_steps = message_update.completed_steps
|
|
62
|
+
if message_update.error is not None:
|
|
63
|
+
message.error = message_update.error
|
|
64
|
+
self.set(message_id, message)
|
|
65
|
+
|
|
66
|
+
return message
|
|
67
|
+
|
|
68
|
+
def wrap_update_message(
|
|
69
|
+
self, message_id: str, message: Message
|
|
70
|
+
) -> typing.Callable[[MessageUpdate], None]:
|
|
71
|
+
def _update(message_update: MessageUpdate) -> None:
|
|
72
|
+
if message_update.message_text is not None:
|
|
73
|
+
message.message_text = message_update.message_text
|
|
74
|
+
if message_update.data is not None:
|
|
75
|
+
message.data = message_update.data
|
|
76
|
+
if message_update.status is not None:
|
|
77
|
+
message.status = message_update.status
|
|
78
|
+
if message_update.total_steps is not None:
|
|
79
|
+
message.total_steps = message_update.total_steps
|
|
80
|
+
if message_update.completed_steps is not None:
|
|
81
|
+
message.completed_steps = message_update.completed_steps
|
|
82
|
+
if message_update.error is not None:
|
|
83
|
+
message.error = message_update.error
|
|
84
|
+
self.set(message_id, message)
|
|
85
|
+
|
|
86
|
+
return _update
|
|
87
|
+
|
|
88
|
+
def poll_util_done(
|
|
89
|
+
self,
|
|
90
|
+
message_id: str,
|
|
91
|
+
*,
|
|
92
|
+
timeout: float = 60.0,
|
|
93
|
+
model: typing.Type[MessageVar],
|
|
94
|
+
delay: float = 0.2,
|
|
95
|
+
) -> MessageVar:
|
|
96
|
+
ts = time.perf_counter()
|
|
97
|
+
msg: MessageVar | None = None
|
|
98
|
+
|
|
99
|
+
while is_timeout := (time.perf_counter() - ts < timeout):
|
|
100
|
+
msg = self.retrieve_as(message_id, model)
|
|
101
|
+
if msg.status in [MessageStatus.COMPLETED, MessageStatus.FAILED]:
|
|
102
|
+
break
|
|
103
|
+
time.sleep(delay)
|
|
104
|
+
|
|
105
|
+
if msg is None:
|
|
106
|
+
raise fastapi.HTTPException(status_code=404, detail="Message not found")
|
|
107
|
+
|
|
108
|
+
if is_timeout:
|
|
109
|
+
raise fastapi.HTTPException(
|
|
110
|
+
status_code=408, detail="Timeout waiting for message to be done"
|
|
111
|
+
)
|
|
112
|
+
|
|
113
|
+
return msg
|
|
114
|
+
|
|
115
|
+
def get_cache_key(self, message_id: str) -> str:
|
|
116
|
+
return f"{self.client.settings.WEB_QUEUE_NAME}:message:{message_id}"
|
|
@@ -14,7 +14,7 @@ from playwright.sync_api import TimeoutError as PlaywrightTimeoutError
|
|
|
14
14
|
from str_or_none import str_or_none
|
|
15
15
|
|
|
16
16
|
from web_queue.client import WebQueueClient
|
|
17
|
-
from web_queue.types.
|
|
17
|
+
from web_queue.types.message import MessageUpdate
|
|
18
18
|
from web_queue.utils.compression import compress, decompress
|
|
19
19
|
from web_queue.utils.human_delay import human_delay
|
|
20
20
|
from web_queue.utils.page_with_init_script import page_with_init_script
|
|
@@ -49,7 +49,7 @@ class Web:
|
|
|
49
49
|
scrolling_times: int = 3,
|
|
50
50
|
human_delay_base_delay: float = 1.2,
|
|
51
51
|
dynamic_content_loading_delay: float = 2.0,
|
|
52
|
-
step_callback: typing.Optional[
|
|
52
|
+
step_callback: typing.Optional[typing.Callable[["MessageUpdate"], None]] = None,
|
|
53
53
|
) -> bs4.BeautifulSoup:
|
|
54
54
|
_url = str_or_none(str(url))
|
|
55
55
|
if not _url:
|
|
@@ -80,7 +80,13 @@ class Web:
|
|
|
80
80
|
],
|
|
81
81
|
)
|
|
82
82
|
if step_callback:
|
|
83
|
-
step_callback(
|
|
83
|
+
step_callback(
|
|
84
|
+
MessageUpdate(
|
|
85
|
+
total_steps=100,
|
|
86
|
+
completed_steps=15,
|
|
87
|
+
message_text="Launching browser...",
|
|
88
|
+
)
|
|
89
|
+
)
|
|
84
90
|
|
|
85
91
|
# Create context
|
|
86
92
|
_viewport_size = secrets.choice(self.VIEWPORT_SIZES)
|
|
@@ -106,7 +112,13 @@ class Web:
|
|
|
106
112
|
page = await page_with_init_script(page)
|
|
107
113
|
|
|
108
114
|
if step_callback:
|
|
109
|
-
step_callback(
|
|
115
|
+
step_callback(
|
|
116
|
+
MessageUpdate(
|
|
117
|
+
total_steps=100,
|
|
118
|
+
completed_steps=30,
|
|
119
|
+
message_text="Navigating to URL...",
|
|
120
|
+
)
|
|
121
|
+
)
|
|
110
122
|
|
|
111
123
|
try:
|
|
112
124
|
# Navigate to URL
|
|
@@ -125,7 +137,13 @@ class Web:
|
|
|
125
137
|
await human_delay(h_delay)
|
|
126
138
|
|
|
127
139
|
if step_callback:
|
|
128
|
-
step_callback(
|
|
140
|
+
step_callback(
|
|
141
|
+
MessageUpdate(
|
|
142
|
+
total_steps=100,
|
|
143
|
+
completed_steps=45,
|
|
144
|
+
message_text="Waiting for full page load...",
|
|
145
|
+
)
|
|
146
|
+
)
|
|
129
147
|
|
|
130
148
|
# Simulate smooth mouse circling three times
|
|
131
149
|
start_position = None
|
|
@@ -156,7 +174,13 @@ class Web:
|
|
|
156
174
|
)
|
|
157
175
|
|
|
158
176
|
if step_callback:
|
|
159
|
-
step_callback(
|
|
177
|
+
step_callback(
|
|
178
|
+
MessageUpdate(
|
|
179
|
+
total_steps=100,
|
|
180
|
+
completed_steps=60,
|
|
181
|
+
message_text="Finished fetching HTML content",
|
|
182
|
+
)
|
|
183
|
+
)
|
|
160
184
|
|
|
161
185
|
# Screenshot and PDF
|
|
162
186
|
snapshot_filename = f"{int(time.time()*1E3)}_{secrets.token_hex(2)}"
|
|
@@ -14,7 +14,7 @@ class MessageStatus(enum.StrEnum):
|
|
|
14
14
|
|
|
15
15
|
class Message(pydantic.BaseModel):
|
|
16
16
|
id: str | None = None
|
|
17
|
-
|
|
17
|
+
message_text: str = ""
|
|
18
18
|
data: typing.Any
|
|
19
19
|
status: MessageStatus = pydantic.Field(default=MessageStatus.PENDING)
|
|
20
20
|
total_steps: int = pydantic.Field(default=100)
|
|
@@ -33,3 +33,15 @@ class Message(pydantic.BaseModel):
|
|
|
33
33
|
return cls.model_validate_json(any)
|
|
34
34
|
else:
|
|
35
35
|
raise ValueError(f"Invalid type: {type(any)}")
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
MessageVar = typing.TypeVar("MessageVar", bound=Message)
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
class MessageUpdate(pydantic.BaseModel):
|
|
42
|
+
message_text: typing.Optional[str] = None
|
|
43
|
+
data: typing.Optional[typing.Any] = None
|
|
44
|
+
status: typing.Optional[MessageStatus] = None
|
|
45
|
+
total_steps: typing.Optional[int] = None
|
|
46
|
+
completed_steps: typing.Optional[int] = None
|
|
47
|
+
error: typing.Optional[str] = None
|
|
@@ -1,18 +1,13 @@
|
|
|
1
1
|
import typing
|
|
2
2
|
|
|
3
3
|
import bs4
|
|
4
|
+
import html_to_markdown
|
|
4
5
|
|
|
5
6
|
|
|
6
7
|
def html_to_str(html: bs4.BeautifulSoup | bs4.Tag | str) -> str:
|
|
7
8
|
html = bs4.BeautifulSoup(html, "html.parser") if isinstance(html, str) else html
|
|
8
|
-
|
|
9
|
-
|
|
10
|
-
for p in html.find_all("p"):
|
|
11
|
-
content = p.get_text(separator="\n", strip=True)
|
|
12
|
-
full_text += content
|
|
13
|
-
full_text += "\n"
|
|
14
|
-
|
|
15
|
-
return full_text.strip()
|
|
9
|
+
content = html_to_markdown.convert(str(html)).strip()
|
|
10
|
+
return "\n".join(line.rstrip() for line in content.splitlines())
|
|
16
11
|
|
|
17
12
|
|
|
18
13
|
def htmls_to_str(
|
|
@@ -1 +0,0 @@
|
|
|
1
|
-
0.2.0
|
|
@@ -1,91 +0,0 @@
|
|
|
1
|
-
import asyncio
|
|
2
|
-
import logging
|
|
3
|
-
import typing
|
|
4
|
-
|
|
5
|
-
import huey
|
|
6
|
-
import logfire
|
|
7
|
-
import logging_bullet_train as lbt
|
|
8
|
-
from huey.api import Task
|
|
9
|
-
|
|
10
|
-
import web_queue.config
|
|
11
|
-
from web_queue.types.message import MessageStatus
|
|
12
|
-
|
|
13
|
-
if typing.TYPE_CHECKING:
|
|
14
|
-
from web_queue.client import WebQueueClient
|
|
15
|
-
from web_queue.types.fetch_html_message import FetchHTMLMessage
|
|
16
|
-
from web_queue.types.html_content import HTMLContent
|
|
17
|
-
|
|
18
|
-
lbt.set_logger("web_queue")
|
|
19
|
-
|
|
20
|
-
logfire.configure()
|
|
21
|
-
logfire.instrument_openai()
|
|
22
|
-
|
|
23
|
-
logger = logging.getLogger(__name__)
|
|
24
|
-
|
|
25
|
-
logger.info("Web queue app starting...")
|
|
26
|
-
|
|
27
|
-
web_queue_settings = web_queue.config.Settings()
|
|
28
|
-
logger.info(f"Web queue connecting to redis: {web_queue_settings.web_queue_safe_url}")
|
|
29
|
-
|
|
30
|
-
huey_app = huey.RedisExpireHuey(
|
|
31
|
-
web_queue_settings.WEB_QUEUE_NAME,
|
|
32
|
-
url=web_queue_settings.WEB_QUEUE_URL.get_secret_value(),
|
|
33
|
-
expire_time=24 * 60 * 60, # 24 hours
|
|
34
|
-
)
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
@huey_app.task(
|
|
38
|
-
retries=1,
|
|
39
|
-
retry_delay=8,
|
|
40
|
-
expires=24 * 60 * 60,
|
|
41
|
-
context=True,
|
|
42
|
-
)
|
|
43
|
-
def fetch_html(
|
|
44
|
-
message: typing.Union["FetchHTMLMessage", str, bytes], task: Task
|
|
45
|
-
) -> typing.Optional[typing.Text]:
|
|
46
|
-
from web_queue.types.fetch_html_message import FetchHTMLMessage
|
|
47
|
-
|
|
48
|
-
message = FetchHTMLMessage.from_any(message)
|
|
49
|
-
message.id = task.id
|
|
50
|
-
message.status = MessageStatus.RUNNING
|
|
51
|
-
|
|
52
|
-
wq_cache_key = web_queue_settings.get_message_cache_key(message.id)
|
|
53
|
-
|
|
54
|
-
def update_message_cache(
|
|
55
|
-
total_steps: int | None = None,
|
|
56
|
-
completed_steps: int | None = None,
|
|
57
|
-
message_text: str | None = None,
|
|
58
|
-
):
|
|
59
|
-
if total_steps is not None:
|
|
60
|
-
message.total_steps = total_steps
|
|
61
|
-
if completed_steps is not None:
|
|
62
|
-
message.completed_steps = completed_steps
|
|
63
|
-
if message_text is not None:
|
|
64
|
-
message.message = message_text
|
|
65
|
-
web_queue_settings.message_cache.set(wq_cache_key, message.model_dump_json())
|
|
66
|
-
|
|
67
|
-
logger.info(f"Fetching HTML with parameters: {message.data.model_dump_json()}")
|
|
68
|
-
update_message_cache(message_text="Starting to fetch HTML...")
|
|
69
|
-
|
|
70
|
-
loop = asyncio.new_event_loop()
|
|
71
|
-
asyncio.set_event_loop(loop)
|
|
72
|
-
|
|
73
|
-
try:
|
|
74
|
-
wq_client: "WebQueueClient" = web_queue_settings.web_queue_client
|
|
75
|
-
html_content: "HTMLContent" = loop.run_until_complete(
|
|
76
|
-
wq_client.fetch(
|
|
77
|
-
**message.data.model_dump(), step_callback=update_message_cache
|
|
78
|
-
)
|
|
79
|
-
)
|
|
80
|
-
update_message_cache(100, 100, "Finished fetching HTML.")
|
|
81
|
-
return html_content.model_dump_json()
|
|
82
|
-
|
|
83
|
-
except Exception as e:
|
|
84
|
-
logger.exception(e)
|
|
85
|
-
logger.error(f"Failed to fetch HTML: {e}")
|
|
86
|
-
update_message_cache(message_text=f"Failed to fetch HTML: {e}")
|
|
87
|
-
|
|
88
|
-
finally:
|
|
89
|
-
loop.close()
|
|
90
|
-
|
|
91
|
-
return None
|
|
@@ -1,47 +0,0 @@
|
|
|
1
|
-
import functools
|
|
2
|
-
import typing
|
|
3
|
-
|
|
4
|
-
import cachetic
|
|
5
|
-
import pydantic
|
|
6
|
-
import pydantic_settings
|
|
7
|
-
import redis
|
|
8
|
-
import yarl
|
|
9
|
-
from str_or_none import str_or_none
|
|
10
|
-
|
|
11
|
-
if typing.TYPE_CHECKING:
|
|
12
|
-
from web_queue.client import WebQueueClient
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
class Settings(pydantic_settings.BaseSettings):
|
|
16
|
-
WEB_QUEUE_NAME: str = pydantic.Field(default="web-queue")
|
|
17
|
-
WEB_QUEUE_URL: pydantic.SecretStr = pydantic.SecretStr("")
|
|
18
|
-
MESSAGE_CACHE_EXPIRE_SECONDS: int = pydantic.Field(default=60 * 60 * 24) # 1 day
|
|
19
|
-
|
|
20
|
-
@pydantic.model_validator(mode="after")
|
|
21
|
-
def validate_values(self) -> typing.Self:
|
|
22
|
-
if str_or_none(self.WEB_QUEUE_NAME) is None:
|
|
23
|
-
raise ValueError("WEB_QUEUE_NAME is required")
|
|
24
|
-
if str_or_none(self.WEB_QUEUE_URL.get_secret_value()) is None:
|
|
25
|
-
raise ValueError("WEB_QUEUE_URL is required")
|
|
26
|
-
return self
|
|
27
|
-
|
|
28
|
-
@functools.cached_property
|
|
29
|
-
def web_queue_client(self) -> "WebQueueClient":
|
|
30
|
-
from web_queue.client import WebQueueClient
|
|
31
|
-
|
|
32
|
-
return WebQueueClient()
|
|
33
|
-
|
|
34
|
-
@functools.cached_property
|
|
35
|
-
def message_cache(self) -> "cachetic.Cachetic[typing.Text]":
|
|
36
|
-
return cachetic.Cachetic(
|
|
37
|
-
object_type=pydantic.TypeAdapter(typing.Text),
|
|
38
|
-
cache_url=redis.from_url(self.WEB_QUEUE_URL.get_secret_value()),
|
|
39
|
-
default_ttl=self.MESSAGE_CACHE_EXPIRE_SECONDS,
|
|
40
|
-
)
|
|
41
|
-
|
|
42
|
-
@property
|
|
43
|
-
def web_queue_safe_url(self) -> str:
|
|
44
|
-
return str(yarl.URL(self.WEB_QUEUE_URL.get_secret_value()).with_password("***"))
|
|
45
|
-
|
|
46
|
-
def get_message_cache_key(self, message_id: str) -> str:
|
|
47
|
-
return f"{self.WEB_QUEUE_NAME}:message:{message_id}"
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|