web-queue2 0.1.0__tar.gz → 0.2.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {web_queue2-0.1.0 → web_queue2-0.2.0}/PKG-INFO +2 -2
- {web_queue2-0.1.0 → web_queue2-0.2.0}/pyproject.toml +2 -2
- web_queue2-0.2.0/web_queue/VERSION +1 -0
- {web_queue2-0.1.0 → web_queue2-0.2.0}/web_queue/app.py +32 -4
- {web_queue2-0.1.0 → web_queue2-0.2.0}/web_queue/client/_client.py +6 -1
- {web_queue2-0.1.0 → web_queue2-0.2.0}/web_queue/client/ai/_ai.py +14 -2
- {web_queue2-0.1.0 → web_queue2-0.2.0}/web_queue/client/clean/_clean.py +2 -1
- {web_queue2-0.1.0 → web_queue2-0.2.0}/web_queue/client/web/_web.py +13 -0
- {web_queue2-0.1.0 → web_queue2-0.2.0}/web_queue/config.py +14 -0
- {web_queue2-0.1.0 → web_queue2-0.2.0}/web_queue/types/message.py +1 -0
- web_queue2-0.2.0/web_queue/types/step_callback.py +10 -0
- web_queue2-0.1.0/web_queue/VERSION +0 -1
- {web_queue2-0.1.0 → web_queue2-0.2.0}/LICENSE +0 -0
- {web_queue2-0.1.0 → web_queue2-0.2.0}/README.md +0 -0
- {web_queue2-0.1.0 → web_queue2-0.2.0}/web_queue/__init__.py +0 -0
- {web_queue2-0.1.0 → web_queue2-0.2.0}/web_queue/client/__init__.py +0 -0
- {web_queue2-0.1.0 → web_queue2-0.2.0}/web_queue/client/ai/__init__.py +0 -0
- {web_queue2-0.1.0 → web_queue2-0.2.0}/web_queue/client/clean/__init__.py +0 -0
- {web_queue2-0.1.0 → web_queue2-0.2.0}/web_queue/client/config.py +0 -0
- {web_queue2-0.1.0 → web_queue2-0.2.0}/web_queue/client/web/__init__.py +0 -0
- {web_queue2-0.1.0 → web_queue2-0.2.0}/web_queue/types/__init__.py +0 -0
- {web_queue2-0.1.0 → web_queue2-0.2.0}/web_queue/types/fetch_html_message.py +0 -0
- {web_queue2-0.1.0 → web_queue2-0.2.0}/web_queue/types/html_content.py +0 -0
- {web_queue2-0.1.0 → web_queue2-0.2.0}/web_queue/types/html_metadata_response.py +0 -0
- {web_queue2-0.1.0 → web_queue2-0.2.0}/web_queue/utils/__init__.py +0 -0
- {web_queue2-0.1.0 → web_queue2-0.2.0}/web_queue/utils/compression.py +0 -0
- {web_queue2-0.1.0 → web_queue2-0.2.0}/web_queue/utils/html_cleaner.py +0 -0
- {web_queue2-0.1.0 → web_queue2-0.2.0}/web_queue/utils/html_to_str.py +0 -0
- {web_queue2-0.1.0 → web_queue2-0.2.0}/web_queue/utils/human_delay.py +0 -0
- {web_queue2-0.1.0 → web_queue2-0.2.0}/web_queue/utils/page_with_init_script.py +0 -0
- {web_queue2-0.1.0 → web_queue2-0.2.0}/web_queue/utils/simulate_mouse_circling.py +0 -0
- {web_queue2-0.1.0 → web_queue2-0.2.0}/web_queue/utils/simulate_scrolling.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: web-queue2
|
|
3
|
-
Version: 0.1.0
|
|
3
|
+
Version: 0.2.0
|
|
4
4
|
Summary: Get web content from queue.
|
|
5
5
|
License: MIT
|
|
6
6
|
License-File: LICENSE
|
|
@@ -19,7 +19,7 @@ Requires-Dist: dictpress
|
|
|
19
19
|
Requires-Dist: fastapi
|
|
20
20
|
Requires-Dist: httpx
|
|
21
21
|
Requires-Dist: huey
|
|
22
|
-
Requires-Dist: logfire
|
|
22
|
+
Requires-Dist: logfire[redis]
|
|
23
23
|
Requires-Dist: logging_bullet_train
|
|
24
24
|
Requires-Dist: openai (>=1,<2)
|
|
25
25
|
Requires-Dist: openai-agents (>=0.1.0,<1.0.0)
|
|
@@ -7,7 +7,7 @@ dependencies = [
|
|
|
7
7
|
"fastapi",
|
|
8
8
|
"httpx",
|
|
9
9
|
"huey",
|
|
10
|
-
"logfire",
|
|
10
|
+
"logfire[redis]",
|
|
11
11
|
"logging_bullet_train",
|
|
12
12
|
"openai (>=1,<2)",
|
|
13
13
|
"openai-agents (>=0.1.0,<1.0.0)",
|
|
@@ -23,7 +23,7 @@ license = { text = "MIT" }
|
|
|
23
23
|
name = "web-queue2"
|
|
24
24
|
readme = "README.md"
|
|
25
25
|
requires-python = ">=3.11,<4"
|
|
26
|
-
version = "0.1.0"
|
|
26
|
+
version = "0.2.0"
|
|
27
27
|
|
|
28
28
|
[project.urls]
|
|
29
29
|
Homepage = "https://github.com/allen2c/web-queue"
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
0.2.0
|
|
@@ -8,6 +8,7 @@ import logging_bullet_train as lbt
|
|
|
8
8
|
from huey.api import Task
|
|
9
9
|
|
|
10
10
|
import web_queue.config
|
|
11
|
+
from web_queue.types.message import MessageStatus
|
|
11
12
|
|
|
12
13
|
if typing.TYPE_CHECKING:
|
|
13
14
|
from web_queue.client import WebQueueClient
|
|
@@ -41,13 +42,30 @@ huey_app = huey.RedisExpireHuey(
|
|
|
41
42
|
)
|
|
42
43
|
def fetch_html(
|
|
43
44
|
message: typing.Union["FetchHTMLMessage", str, bytes], task: Task
|
|
44
|
-
) ->
|
|
45
|
+
) -> typing.Optional[typing.Text]:
|
|
45
46
|
from web_queue.types.fetch_html_message import FetchHTMLMessage
|
|
46
47
|
|
|
47
48
|
message = FetchHTMLMessage.from_any(message)
|
|
48
49
|
message.id = task.id
|
|
49
|
-
|
|
50
|
-
|
|
50
|
+
message.status = MessageStatus.RUNNING
|
|
51
|
+
|
|
52
|
+
wq_cache_key = web_queue_settings.get_message_cache_key(message.id)
|
|
53
|
+
|
|
54
|
+
def update_message_cache(
|
|
55
|
+
total_steps: int | None = None,
|
|
56
|
+
completed_steps: int | None = None,
|
|
57
|
+
message_text: str | None = None,
|
|
58
|
+
):
|
|
59
|
+
if total_steps is not None:
|
|
60
|
+
message.total_steps = total_steps
|
|
61
|
+
if completed_steps is not None:
|
|
62
|
+
message.completed_steps = completed_steps
|
|
63
|
+
if message_text is not None:
|
|
64
|
+
message.message = message_text
|
|
65
|
+
web_queue_settings.message_cache.set(wq_cache_key, message.model_dump_json())
|
|
66
|
+
|
|
67
|
+
logger.info(f"Fetching HTML with parameters: {message.data.model_dump_json()}")
|
|
68
|
+
update_message_cache(message_text="Starting to fetch HTML...")
|
|
51
69
|
|
|
52
70
|
loop = asyncio.new_event_loop()
|
|
53
71
|
asyncio.set_event_loop(loop)
|
|
@@ -55,9 +73,19 @@ def fetch_html(
|
|
|
55
73
|
try:
|
|
56
74
|
wq_client: "WebQueueClient" = web_queue_settings.web_queue_client
|
|
57
75
|
html_content: "HTMLContent" = loop.run_until_complete(
|
|
58
|
-
wq_client.fetch(**message.data.model_dump())
|
|
76
|
+
wq_client.fetch(
|
|
77
|
+
**message.data.model_dump(), step_callback=update_message_cache
|
|
78
|
+
)
|
|
59
79
|
)
|
|
80
|
+
update_message_cache(100, 100, "Finished fetching HTML.")
|
|
60
81
|
return html_content.model_dump_json()
|
|
61
82
|
|
|
83
|
+
except Exception as e:
|
|
84
|
+
logger.exception(e)
|
|
85
|
+
logger.error(f"Failed to fetch HTML: {e}")
|
|
86
|
+
update_message_cache(message_text=f"Failed to fetch HTML: {e}")
|
|
87
|
+
|
|
62
88
|
finally:
|
|
63
89
|
loop.close()
|
|
90
|
+
|
|
91
|
+
return None
|
|
@@ -10,6 +10,7 @@ if typing.TYPE_CHECKING:
|
|
|
10
10
|
from web_queue.client.config import Settings
|
|
11
11
|
from web_queue.client.web import Web
|
|
12
12
|
from web_queue.types.html_content import HTMLContent
|
|
13
|
+
from web_queue.types.step_callback import StepCallbackType
|
|
13
14
|
|
|
14
15
|
|
|
15
16
|
class WebQueueClient:
|
|
@@ -46,6 +47,7 @@ class WebQueueClient:
|
|
|
46
47
|
scrolling_times: int = 3,
|
|
47
48
|
human_delay_base_delay: float = 1.2,
|
|
48
49
|
dynamic_content_loading_delay: float = 2.0,
|
|
50
|
+
step_callback: typing.Optional["StepCallbackType"] = None,
|
|
49
51
|
) -> "HTMLContent":
|
|
50
52
|
from web_queue.types.html_content import HTMLContent
|
|
51
53
|
from web_queue.utils.html_to_str import htmls_to_str
|
|
@@ -59,13 +61,16 @@ class WebQueueClient:
|
|
|
59
61
|
scrolling_times=scrolling_times,
|
|
60
62
|
human_delay_base_delay=human_delay_base_delay,
|
|
61
63
|
dynamic_content_loading_delay=dynamic_content_loading_delay,
|
|
64
|
+
step_callback=step_callback,
|
|
62
65
|
)
|
|
63
66
|
|
|
64
67
|
# Clean HTML
|
|
65
68
|
html = self.clean.as_main_content(html)
|
|
66
69
|
|
|
67
70
|
# Extract content metadata
|
|
68
|
-
html_metadata = await self.ai.as_html_metadata(html)
|
|
71
|
+
html_metadata = await self.ai.as_html_metadata(
|
|
72
|
+
html, step_callback=step_callback
|
|
73
|
+
)
|
|
69
74
|
|
|
70
75
|
if not html_metadata:
|
|
71
76
|
raise ValueError(f"Failed to retrieve content metadata for url: {url}")
|
|
@@ -11,6 +11,7 @@ from rich.pretty import pretty_repr
|
|
|
11
11
|
|
|
12
12
|
from web_queue.client import WebQueueClient
|
|
13
13
|
from web_queue.types.html_metadata_response import HTMLMetadataResponse
|
|
14
|
+
from web_queue.types.step_callback import StepCallbackType
|
|
14
15
|
from web_queue.utils.compression import compress, decompress
|
|
15
16
|
|
|
16
17
|
if typing.TYPE_CHECKING:
|
|
@@ -25,7 +26,9 @@ class AI:
|
|
|
25
26
|
|
|
26
27
|
@logfire.instrument
|
|
27
28
|
async def as_html_metadata(
|
|
28
|
-
self,
|
|
29
|
+
self,
|
|
30
|
+
html: typing.Union["bs4.BeautifulSoup", typing.Text],
|
|
31
|
+
step_callback: typing.Optional[StepCallbackType] = None,
|
|
29
32
|
) -> typing.Optional[HTMLMetadataResponse]:
|
|
30
33
|
"""Extract content metadata and CSS selector from HTML.
|
|
31
34
|
|
|
@@ -36,7 +39,10 @@ class AI:
|
|
|
36
39
|
|
|
37
40
|
html = str(html)
|
|
38
41
|
|
|
39
|
-
logger.info(
|
|
42
|
+
logger.info(
|
|
43
|
+
"AI is extracting content metadata from HTML: "
|
|
44
|
+
+ f"{pretty_repr(str(html), max_string=64)}"
|
|
45
|
+
)
|
|
40
46
|
|
|
41
47
|
cache_key = (
|
|
42
48
|
"retrieve_html_content_metadata:"
|
|
@@ -103,6 +109,9 @@ class AI:
|
|
|
103
109
|
""" # noqa: E501
|
|
104
110
|
).strip()
|
|
105
111
|
|
|
112
|
+
if step_callback:
|
|
113
|
+
step_callback(100, 75, "Starting to extract content metadata...")
|
|
114
|
+
|
|
106
115
|
try:
|
|
107
116
|
parsed_cmpl = await openai_client.chat.completions.parse(
|
|
108
117
|
messages=[
|
|
@@ -129,6 +138,9 @@ class AI:
|
|
|
129
138
|
compress(output.model_dump_json()),
|
|
130
139
|
)
|
|
131
140
|
|
|
141
|
+
if step_callback:
|
|
142
|
+
step_callback(100, 90, "Finished extracting content metadata.")
|
|
143
|
+
|
|
132
144
|
return output
|
|
133
145
|
|
|
134
146
|
else:
|
|
@@ -2,6 +2,7 @@ import logging
|
|
|
2
2
|
import typing
|
|
3
3
|
|
|
4
4
|
import bs4
|
|
5
|
+
from rich.pretty import pretty_repr
|
|
5
6
|
|
|
6
7
|
from web_queue.client import WebQueueClient
|
|
7
8
|
from web_queue.utils.html_cleaner import HTMLCleaner
|
|
@@ -20,6 +21,6 @@ class Clean:
|
|
|
20
21
|
else html
|
|
21
22
|
)
|
|
22
23
|
|
|
23
|
-
logger.info(f"Cleaning HTML: {html}")
|
|
24
|
+
logger.info(f"Cleaning HTML: {pretty_repr(str(html), max_string=64)}")
|
|
24
25
|
cleaned_html = HTMLCleaner.clean_as_main_content_html_str(html)
|
|
25
26
|
return bs4.BeautifulSoup(cleaned_html, "html.parser")
|
|
@@ -14,6 +14,7 @@ from playwright.sync_api import TimeoutError as PlaywrightTimeoutError
|
|
|
14
14
|
from str_or_none import str_or_none
|
|
15
15
|
|
|
16
16
|
from web_queue.client import WebQueueClient
|
|
17
|
+
from web_queue.types.step_callback import StepCallbackType
|
|
17
18
|
from web_queue.utils.compression import compress, decompress
|
|
18
19
|
from web_queue.utils.human_delay import human_delay
|
|
19
20
|
from web_queue.utils.page_with_init_script import page_with_init_script
|
|
@@ -48,6 +49,7 @@ class Web:
|
|
|
48
49
|
scrolling_times: int = 3,
|
|
49
50
|
human_delay_base_delay: float = 1.2,
|
|
50
51
|
dynamic_content_loading_delay: float = 2.0,
|
|
52
|
+
step_callback: typing.Optional[StepCallbackType] = None,
|
|
51
53
|
) -> bs4.BeautifulSoup:
|
|
52
54
|
_url = str_or_none(str(url))
|
|
53
55
|
if not _url:
|
|
@@ -77,6 +79,8 @@ class Web:
|
|
|
77
79
|
"--disable-features=VizDisplayCompositor",
|
|
78
80
|
],
|
|
79
81
|
)
|
|
82
|
+
if step_callback:
|
|
83
|
+
step_callback(100, 15, "Launching browser...")
|
|
80
84
|
|
|
81
85
|
# Create context
|
|
82
86
|
_viewport_size = secrets.choice(self.VIEWPORT_SIZES)
|
|
@@ -101,6 +105,9 @@ class Web:
|
|
|
101
105
|
# Inject script to hide automation features
|
|
102
106
|
page = await page_with_init_script(page)
|
|
103
107
|
|
|
108
|
+
if step_callback:
|
|
109
|
+
step_callback(100, 30, "Navigating to URL...")
|
|
110
|
+
|
|
104
111
|
try:
|
|
105
112
|
# Navigate to URL
|
|
106
113
|
logger.debug(f"Navigating (timeout: {goto_timeout}ms) to {_url}")
|
|
@@ -117,6 +124,9 @@ class Web:
|
|
|
117
124
|
await page.wait_for_load_state("domcontentloaded")
|
|
118
125
|
await human_delay(h_delay)
|
|
119
126
|
|
|
127
|
+
if step_callback:
|
|
128
|
+
step_callback(100, 45, "Waiting for full page load...")
|
|
129
|
+
|
|
120
130
|
# Simulate smooth mouse circling three times
|
|
121
131
|
start_position = None
|
|
122
132
|
for i in range(circling_times):
|
|
@@ -145,6 +155,9 @@ class Web:
|
|
|
145
155
|
f"Fetched HTML content size: {html_content_size} for {_url}"
|
|
146
156
|
)
|
|
147
157
|
|
|
158
|
+
if step_callback:
|
|
159
|
+
step_callback(100, 60, "Finished fetching HTML content.")
|
|
160
|
+
|
|
148
161
|
# Screenshot and PDF
|
|
149
162
|
snapshot_filename = f"{int(time.time()*1E3)}_{secrets.token_hex(2)}"
|
|
150
163
|
screenshot_path = self.client.settings.web_screenshot_path.joinpath(
|
|
@@ -1,8 +1,10 @@
|
|
|
1
1
|
import functools
|
|
2
2
|
import typing
|
|
3
3
|
|
|
4
|
+
import cachetic
|
|
4
5
|
import pydantic
|
|
5
6
|
import pydantic_settings
|
|
7
|
+
import redis
|
|
6
8
|
import yarl
|
|
7
9
|
from str_or_none import str_or_none
|
|
8
10
|
|
|
@@ -13,6 +15,7 @@ if typing.TYPE_CHECKING:
|
|
|
13
15
|
class Settings(pydantic_settings.BaseSettings):
|
|
14
16
|
WEB_QUEUE_NAME: str = pydantic.Field(default="web-queue")
|
|
15
17
|
WEB_QUEUE_URL: pydantic.SecretStr = pydantic.SecretStr("")
|
|
18
|
+
MESSAGE_CACHE_EXPIRE_SECONDS: int = pydantic.Field(default=60 * 60 * 24) # 1 day
|
|
16
19
|
|
|
17
20
|
@pydantic.model_validator(mode="after")
|
|
18
21
|
def validate_values(self) -> typing.Self:
|
|
@@ -28,6 +31,17 @@ class Settings(pydantic_settings.BaseSettings):
|
|
|
28
31
|
|
|
29
32
|
return WebQueueClient()
|
|
30
33
|
|
|
34
|
+
@functools.cached_property
|
|
35
|
+
def message_cache(self) -> "cachetic.Cachetic[typing.Text]":
|
|
36
|
+
return cachetic.Cachetic(
|
|
37
|
+
object_type=pydantic.TypeAdapter(typing.Text),
|
|
38
|
+
cache_url=redis.from_url(self.WEB_QUEUE_URL.get_secret_value()),
|
|
39
|
+
default_ttl=self.MESSAGE_CACHE_EXPIRE_SECONDS,
|
|
40
|
+
)
|
|
41
|
+
|
|
31
42
|
@property
|
|
32
43
|
def web_queue_safe_url(self) -> str:
|
|
33
44
|
return str(yarl.URL(self.WEB_QUEUE_URL.get_secret_value()).with_password("***"))
|
|
45
|
+
|
|
46
|
+
def get_message_cache_key(self, message_id: str) -> str:
|
|
47
|
+
return f"{self.WEB_QUEUE_NAME}:message:{message_id}"
|
|
@@ -14,6 +14,7 @@ class MessageStatus(enum.StrEnum):
|
|
|
14
14
|
|
|
15
15
|
class Message(pydantic.BaseModel):
|
|
16
16
|
id: str | None = None
|
|
17
|
+
message: str = ""
|
|
17
18
|
data: typing.Any
|
|
18
19
|
status: MessageStatus = pydantic.Field(default=MessageStatus.PENDING)
|
|
19
20
|
total_steps: int = pydantic.Field(default=100)
|
|
@@ -1 +0,0 @@
|
|
|
1
|
-
0.1.0
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|