web-queue2 0.1.0__tar.gz → 0.2.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (32)
  1. {web_queue2-0.1.0 → web_queue2-0.2.0}/PKG-INFO +2 -2
  2. {web_queue2-0.1.0 → web_queue2-0.2.0}/pyproject.toml +2 -2
  3. web_queue2-0.2.0/web_queue/VERSION +1 -0
  4. {web_queue2-0.1.0 → web_queue2-0.2.0}/web_queue/app.py +32 -4
  5. {web_queue2-0.1.0 → web_queue2-0.2.0}/web_queue/client/_client.py +6 -1
  6. {web_queue2-0.1.0 → web_queue2-0.2.0}/web_queue/client/ai/_ai.py +14 -2
  7. {web_queue2-0.1.0 → web_queue2-0.2.0}/web_queue/client/clean/_clean.py +2 -1
  8. {web_queue2-0.1.0 → web_queue2-0.2.0}/web_queue/client/web/_web.py +13 -0
  9. {web_queue2-0.1.0 → web_queue2-0.2.0}/web_queue/config.py +14 -0
  10. {web_queue2-0.1.0 → web_queue2-0.2.0}/web_queue/types/message.py +1 -0
  11. web_queue2-0.2.0/web_queue/types/step_callback.py +10 -0
  12. web_queue2-0.1.0/web_queue/VERSION +0 -1
  13. {web_queue2-0.1.0 → web_queue2-0.2.0}/LICENSE +0 -0
  14. {web_queue2-0.1.0 → web_queue2-0.2.0}/README.md +0 -0
  15. {web_queue2-0.1.0 → web_queue2-0.2.0}/web_queue/__init__.py +0 -0
  16. {web_queue2-0.1.0 → web_queue2-0.2.0}/web_queue/client/__init__.py +0 -0
  17. {web_queue2-0.1.0 → web_queue2-0.2.0}/web_queue/client/ai/__init__.py +0 -0
  18. {web_queue2-0.1.0 → web_queue2-0.2.0}/web_queue/client/clean/__init__.py +0 -0
  19. {web_queue2-0.1.0 → web_queue2-0.2.0}/web_queue/client/config.py +0 -0
  20. {web_queue2-0.1.0 → web_queue2-0.2.0}/web_queue/client/web/__init__.py +0 -0
  21. {web_queue2-0.1.0 → web_queue2-0.2.0}/web_queue/types/__init__.py +0 -0
  22. {web_queue2-0.1.0 → web_queue2-0.2.0}/web_queue/types/fetch_html_message.py +0 -0
  23. {web_queue2-0.1.0 → web_queue2-0.2.0}/web_queue/types/html_content.py +0 -0
  24. {web_queue2-0.1.0 → web_queue2-0.2.0}/web_queue/types/html_metadata_response.py +0 -0
  25. {web_queue2-0.1.0 → web_queue2-0.2.0}/web_queue/utils/__init__.py +0 -0
  26. {web_queue2-0.1.0 → web_queue2-0.2.0}/web_queue/utils/compression.py +0 -0
  27. {web_queue2-0.1.0 → web_queue2-0.2.0}/web_queue/utils/html_cleaner.py +0 -0
  28. {web_queue2-0.1.0 → web_queue2-0.2.0}/web_queue/utils/html_to_str.py +0 -0
  29. {web_queue2-0.1.0 → web_queue2-0.2.0}/web_queue/utils/human_delay.py +0 -0
  30. {web_queue2-0.1.0 → web_queue2-0.2.0}/web_queue/utils/page_with_init_script.py +0 -0
  31. {web_queue2-0.1.0 → web_queue2-0.2.0}/web_queue/utils/simulate_mouse_circling.py +0 -0
  32. {web_queue2-0.1.0 → web_queue2-0.2.0}/web_queue/utils/simulate_scrolling.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: web-queue2
3
- Version: 0.1.0
3
+ Version: 0.2.0
4
4
  Summary: Get web content from queue.
5
5
  License: MIT
6
6
  License-File: LICENSE
@@ -19,7 +19,7 @@ Requires-Dist: dictpress
19
19
  Requires-Dist: fastapi
20
20
  Requires-Dist: httpx
21
21
  Requires-Dist: huey
22
- Requires-Dist: logfire
22
+ Requires-Dist: logfire[redis]
23
23
  Requires-Dist: logging_bullet_train
24
24
  Requires-Dist: openai (>=1,<2)
25
25
  Requires-Dist: openai-agents (>=0.1.0,<1.0.0)
@@ -7,7 +7,7 @@ dependencies = [
7
7
  "fastapi",
8
8
  "httpx",
9
9
  "huey",
10
- "logfire",
10
+ "logfire[redis]",
11
11
  "logging_bullet_train",
12
12
  "openai (>=1,<2)",
13
13
  "openai-agents (>=0.1.0,<1.0.0)",
@@ -23,7 +23,7 @@ license = { text = "MIT" }
23
23
  name = "web-queue2"
24
24
  readme = "README.md"
25
25
  requires-python = ">=3.11,<4"
26
- version = "0.1.0"
26
+ version = "0.2.0"
27
27
 
28
28
  [project.urls]
29
29
  Homepage = "https://github.com/allen2c/web-queue"
@@ -0,0 +1 @@
1
+ 0.2.0
@@ -8,6 +8,7 @@ import logging_bullet_train as lbt
8
8
  from huey.api import Task
9
9
 
10
10
  import web_queue.config
11
+ from web_queue.types.message import MessageStatus
11
12
 
12
13
  if typing.TYPE_CHECKING:
13
14
  from web_queue.client import WebQueueClient
@@ -41,13 +42,30 @@ huey_app = huey.RedisExpireHuey(
41
42
  )
42
43
  def fetch_html(
43
44
  message: typing.Union["FetchHTMLMessage", str, bytes], task: Task
44
- ) -> str:
45
+ ) -> typing.Optional[typing.Text]:
45
46
  from web_queue.types.fetch_html_message import FetchHTMLMessage
46
47
 
47
48
  message = FetchHTMLMessage.from_any(message)
48
49
  message.id = task.id
49
-
50
- logger.info(f"Fetching HTML from {message.data.url}")
50
+ message.status = MessageStatus.RUNNING
51
+
52
+ wq_cache_key = web_queue_settings.get_message_cache_key(message.id)
53
+
54
+ def update_message_cache(
55
+ total_steps: int | None = None,
56
+ completed_steps: int | None = None,
57
+ message_text: str | None = None,
58
+ ):
59
+ if total_steps is not None:
60
+ message.total_steps = total_steps
61
+ if completed_steps is not None:
62
+ message.completed_steps = completed_steps
63
+ if message_text is not None:
64
+ message.message = message_text
65
+ web_queue_settings.message_cache.set(wq_cache_key, message.model_dump_json())
66
+
67
+ logger.info(f"Fetching HTML with parameters: {message.data.model_dump_json()}")
68
+ update_message_cache(message_text="Starting to fetch HTML...")
51
69
 
52
70
  loop = asyncio.new_event_loop()
53
71
  asyncio.set_event_loop(loop)
@@ -55,9 +73,19 @@ def fetch_html(
55
73
  try:
56
74
  wq_client: "WebQueueClient" = web_queue_settings.web_queue_client
57
75
  html_content: "HTMLContent" = loop.run_until_complete(
58
- wq_client.fetch(**message.data.model_dump())
76
+ wq_client.fetch(
77
+ **message.data.model_dump(), step_callback=update_message_cache
78
+ )
59
79
  )
80
+ update_message_cache(100, 100, "Finished fetching HTML.")
60
81
  return html_content.model_dump_json()
61
82
 
83
+ except Exception as e:
84
+ logger.exception(e)
85
+ logger.error(f"Failed to fetch HTML: {e}")
86
+ update_message_cache(message_text=f"Failed to fetch HTML: {e}")
87
+
62
88
  finally:
63
89
  loop.close()
90
+
91
+ return None
@@ -10,6 +10,7 @@ if typing.TYPE_CHECKING:
10
10
  from web_queue.client.config import Settings
11
11
  from web_queue.client.web import Web
12
12
  from web_queue.types.html_content import HTMLContent
13
+ from web_queue.types.step_callback import StepCallbackType
13
14
 
14
15
 
15
16
  class WebQueueClient:
@@ -46,6 +47,7 @@ class WebQueueClient:
46
47
  scrolling_times: int = 3,
47
48
  human_delay_base_delay: float = 1.2,
48
49
  dynamic_content_loading_delay: float = 2.0,
50
+ step_callback: typing.Optional["StepCallbackType"] = None,
49
51
  ) -> "HTMLContent":
50
52
  from web_queue.types.html_content import HTMLContent
51
53
  from web_queue.utils.html_to_str import htmls_to_str
@@ -59,13 +61,16 @@ class WebQueueClient:
59
61
  scrolling_times=scrolling_times,
60
62
  human_delay_base_delay=human_delay_base_delay,
61
63
  dynamic_content_loading_delay=dynamic_content_loading_delay,
64
+ step_callback=step_callback,
62
65
  )
63
66
 
64
67
  # Clean HTML
65
68
  html = self.clean.as_main_content(html)
66
69
 
67
70
  # Extract content metadata
68
- html_metadata = await self.ai.as_html_metadata(html)
71
+ html_metadata = await self.ai.as_html_metadata(
72
+ html, step_callback=step_callback
73
+ )
69
74
 
70
75
  if not html_metadata:
71
76
  raise ValueError(f"Failed to retrieve content metadata for url: {url}")
@@ -11,6 +11,7 @@ from rich.pretty import pretty_repr
11
11
 
12
12
  from web_queue.client import WebQueueClient
13
13
  from web_queue.types.html_metadata_response import HTMLMetadataResponse
14
+ from web_queue.types.step_callback import StepCallbackType
14
15
  from web_queue.utils.compression import compress, decompress
15
16
 
16
17
  if typing.TYPE_CHECKING:
@@ -25,7 +26,9 @@ class AI:
25
26
 
26
27
  @logfire.instrument
27
28
  async def as_html_metadata(
28
- self, html: typing.Union["bs4.BeautifulSoup", typing.Text]
29
+ self,
30
+ html: typing.Union["bs4.BeautifulSoup", typing.Text],
31
+ step_callback: typing.Optional[StepCallbackType] = None,
29
32
  ) -> typing.Optional[HTMLMetadataResponse]:
30
33
  """Extract content metadata and CSS selector from HTML.
31
34
 
@@ -36,7 +39,10 @@ class AI:
36
39
 
37
40
  html = str(html)
38
41
 
39
- logger.info(f"AI is extracting content metadata from HTML: {html}")
42
+ logger.info(
43
+ "AI is extracting content metadata from HTML: "
44
+ + f"{pretty_repr(str(html), max_string=64)}"
45
+ )
40
46
 
41
47
  cache_key = (
42
48
  "retrieve_html_content_metadata:"
@@ -103,6 +109,9 @@ class AI:
103
109
  """ # noqa: E501
104
110
  ).strip()
105
111
 
112
+ if step_callback:
113
+ step_callback(100, 75, "Starting to extract content metadata...")
114
+
106
115
  try:
107
116
  parsed_cmpl = await openai_client.chat.completions.parse(
108
117
  messages=[
@@ -129,6 +138,9 @@ class AI:
129
138
  compress(output.model_dump_json()),
130
139
  )
131
140
 
141
+ if step_callback:
142
+ step_callback(100, 90, "Finished extracting content metadata.")
143
+
132
144
  return output
133
145
 
134
146
  else:
@@ -2,6 +2,7 @@ import logging
2
2
  import typing
3
3
 
4
4
  import bs4
5
+ from rich.pretty import pretty_repr
5
6
 
6
7
  from web_queue.client import WebQueueClient
7
8
  from web_queue.utils.html_cleaner import HTMLCleaner
@@ -20,6 +21,6 @@ class Clean:
20
21
  else html
21
22
  )
22
23
 
23
- logger.info(f"Cleaning HTML: {html}")
24
+ logger.info(f"Cleaning HTML: {pretty_repr(str(html), max_string=64)}")
24
25
  cleaned_html = HTMLCleaner.clean_as_main_content_html_str(html)
25
26
  return bs4.BeautifulSoup(cleaned_html, "html.parser")
@@ -14,6 +14,7 @@ from playwright.sync_api import TimeoutError as PlaywrightTimeoutError
14
14
  from str_or_none import str_or_none
15
15
 
16
16
  from web_queue.client import WebQueueClient
17
+ from web_queue.types.step_callback import StepCallbackType
17
18
  from web_queue.utils.compression import compress, decompress
18
19
  from web_queue.utils.human_delay import human_delay
19
20
  from web_queue.utils.page_with_init_script import page_with_init_script
@@ -48,6 +49,7 @@ class Web:
48
49
  scrolling_times: int = 3,
49
50
  human_delay_base_delay: float = 1.2,
50
51
  dynamic_content_loading_delay: float = 2.0,
52
+ step_callback: typing.Optional[StepCallbackType] = None,
51
53
  ) -> bs4.BeautifulSoup:
52
54
  _url = str_or_none(str(url))
53
55
  if not _url:
@@ -77,6 +79,8 @@ class Web:
77
79
  "--disable-features=VizDisplayCompositor",
78
80
  ],
79
81
  )
82
+ if step_callback:
83
+ step_callback(100, 15, "Launching browser...")
80
84
 
81
85
  # Create context
82
86
  _viewport_size = secrets.choice(self.VIEWPORT_SIZES)
@@ -101,6 +105,9 @@ class Web:
101
105
  # Inject script to hide automation features
102
106
  page = await page_with_init_script(page)
103
107
 
108
+ if step_callback:
109
+ step_callback(100, 30, "Navigating to URL...")
110
+
104
111
  try:
105
112
  # Navigate to URL
106
113
  logger.debug(f"Navigating (timeout: {goto_timeout}ms) to {_url}")
@@ -117,6 +124,9 @@ class Web:
117
124
  await page.wait_for_load_state("domcontentloaded")
118
125
  await human_delay(h_delay)
119
126
 
127
+ if step_callback:
128
+ step_callback(100, 45, "Waiting for full page load...")
129
+
120
130
  # Simulate smooth mouse circling three times
121
131
  start_position = None
122
132
  for i in range(circling_times):
@@ -145,6 +155,9 @@ class Web:
145
155
  f"Fetched HTML content size: {html_content_size} for {_url}"
146
156
  )
147
157
 
158
+ if step_callback:
159
+ step_callback(100, 60, "Finished fetching HTML content.")
160
+
148
161
  # Screenshot and PDF
149
162
  snapshot_filename = f"{int(time.time()*1E3)}_{secrets.token_hex(2)}"
150
163
  screenshot_path = self.client.settings.web_screenshot_path.joinpath(
@@ -1,8 +1,10 @@
1
1
  import functools
2
2
  import typing
3
3
 
4
+ import cachetic
4
5
  import pydantic
5
6
  import pydantic_settings
7
+ import redis
6
8
  import yarl
7
9
  from str_or_none import str_or_none
8
10
 
@@ -13,6 +15,7 @@ if typing.TYPE_CHECKING:
13
15
  class Settings(pydantic_settings.BaseSettings):
14
16
  WEB_QUEUE_NAME: str = pydantic.Field(default="web-queue")
15
17
  WEB_QUEUE_URL: pydantic.SecretStr = pydantic.SecretStr("")
18
+ MESSAGE_CACHE_EXPIRE_SECONDS: int = pydantic.Field(default=60 * 60 * 24) # 1 day
16
19
 
17
20
  @pydantic.model_validator(mode="after")
18
21
  def validate_values(self) -> typing.Self:
@@ -28,6 +31,17 @@ class Settings(pydantic_settings.BaseSettings):
28
31
 
29
32
  return WebQueueClient()
30
33
 
34
+ @functools.cached_property
35
+ def message_cache(self) -> "cachetic.Cachetic[typing.Text]":
36
+ return cachetic.Cachetic(
37
+ object_type=pydantic.TypeAdapter(typing.Text),
38
+ cache_url=redis.from_url(self.WEB_QUEUE_URL.get_secret_value()),
39
+ default_ttl=self.MESSAGE_CACHE_EXPIRE_SECONDS,
40
+ )
41
+
31
42
  @property
32
43
  def web_queue_safe_url(self) -> str:
33
44
  return str(yarl.URL(self.WEB_QUEUE_URL.get_secret_value()).with_password("***"))
45
+
46
+ def get_message_cache_key(self, message_id: str) -> str:
47
+ return f"{self.WEB_QUEUE_NAME}:message:{message_id}"
@@ -14,6 +14,7 @@ class MessageStatus(enum.StrEnum):
14
14
 
15
15
  class Message(pydantic.BaseModel):
16
16
  id: str | None = None
17
+ message: str = ""
17
18
  data: typing.Any
18
19
  status: MessageStatus = pydantic.Field(default=MessageStatus.PENDING)
19
20
  total_steps: int = pydantic.Field(default=100)
@@ -0,0 +1,10 @@
1
+ import typing
2
+
3
+ StepCallbackType: typing.TypeAlias = typing.Callable[
4
+ [
5
+ typing.Annotated[int, "total_steps"],
6
+ typing.Annotated[int, "completed_steps"],
7
+ typing.Annotated[str, "message"],
8
+ ],
9
+ None,
10
+ ]
@@ -1 +0,0 @@
1
- 0.1.0
File without changes
File without changes