PyPI - web-queue2 - Versions diffs - 0.1.0__tar.gz → 0.2.0__tar.gz - Mend

web-queue2 0.1.0__tar.gz → 0.2.0__tar.gz

This diff shows the differences between the contents of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Files changed (32) hide show

{web_queue2-0.1.0 → web_queue2-0.2.0}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: web-queue2
-Version: 0.1.0
+Version: 0.2.0
 Summary: Get web content from queue.
 License: MIT
 License-File: LICENSE
@@ -19,7 +19,7 @@ Requires-Dist: dictpress
 Requires-Dist: fastapi
 Requires-Dist: httpx
 Requires-Dist: huey
-Requires-Dist: logfire
+Requires-Dist: logfire[redis]
 Requires-Dist: logging_bullet_train
 Requires-Dist: openai (>=1,<2)
 Requires-Dist: openai-agents (>=0.1.0,<1.0.0)

{web_queue2-0.1.0 → web_queue2-0.2.0}/pyproject.toml RENAMED Viewed

@@ -7,7 +7,7 @@ dependencies = [
   "fastapi",
   "httpx",
   "huey",
-  "logfire",
+  "logfire[redis]",
   "logging_bullet_train",
   "openai (>=1,<2)",
   "openai-agents (>=0.1.0,<1.0.0)",
@@ -23,7 +23,7 @@ license = { text = "MIT" }
 name = "web-queue2"
 readme = "README.md"
 requires-python = ">=3.11,<4"
-version = "0.1.0"
+version = "0.2.0"
 [project.urls]
 Homepage = "https://github.com/allen2c/web-queue"

web_queue2-0.2.0/web_queue/VERSION ADDED Viewed

	@@ -0,0 +1 @@
1	+ 0.2.0

{web_queue2-0.1.0 → web_queue2-0.2.0}/web_queue/app.py RENAMED Viewed

@@ -8,6 +8,7 @@ import logging_bullet_train as lbt
 from huey.api import Task
 import web_queue.config
+from web_queue.types.message import MessageStatus
 if typing.TYPE_CHECKING:
     from web_queue.client import WebQueueClient
@@ -41,13 +42,30 @@ huey_app = huey.RedisExpireHuey(
 )
 def fetch_html(
     message: typing.Union["FetchHTMLMessage", str, bytes], task: Task
-) -> str:
+) -> typing.Optional[typing.Text]:
     from web_queue.types.fetch_html_message import FetchHTMLMessage
     message = FetchHTMLMessage.from_any(message)
     message.id = task.id
-    logger.info(f"Fetching HTML from {message.data.url}")
+    message.status = MessageStatus.RUNNING
+    wq_cache_key = web_queue_settings.get_message_cache_key(message.id)
+    def update_message_cache(
+        total_steps: int | None = None,
+        completed_steps: int | None = None,
+        message_text: str | None = None,
+    ):
+        if total_steps is not None:
+            message.total_steps = total_steps
+        if completed_steps is not None:
+            message.completed_steps = completed_steps
+        if message_text is not None:
+            message.message = message_text
+        web_queue_settings.message_cache.set(wq_cache_key, message.model_dump_json())
+    logger.info(f"Fetching HTML with parameters: {message.data.model_dump_json()}")
+    update_message_cache(message_text="Starting to fetch HTML...")
     loop = asyncio.new_event_loop()
     asyncio.set_event_loop(loop)
@@ -55,9 +73,19 @@ def fetch_html(
     try:
         wq_client: "WebQueueClient" = web_queue_settings.web_queue_client
         html_content: "HTMLContent" = loop.run_until_complete(
-            wq_client.fetch(**message.data.model_dump())
+            wq_client.fetch(
+                **message.data.model_dump(), step_callback=update_message_cache
+            )
         )
+        update_message_cache(100, 100, "Finished fetching HTML.")
         return html_content.model_dump_json()
+    except Exception as e:
+        logger.exception(e)
+        logger.error(f"Failed to fetch HTML: {e}")
+        update_message_cache(message_text=f"Failed to fetch HTML: {e}")
     finally:
         loop.close()
+    return None

{web_queue2-0.1.0 → web_queue2-0.2.0}/web_queue/client/_client.py RENAMED Viewed

@@ -10,6 +10,7 @@ if typing.TYPE_CHECKING:
     from web_queue.client.config import Settings
     from web_queue.client.web import Web
     from web_queue.types.html_content import HTMLContent
+    from web_queue.types.step_callback import StepCallbackType
 class WebQueueClient:
@@ -46,6 +47,7 @@ class WebQueueClient:
         scrolling_times: int = 3,
         human_delay_base_delay: float = 1.2,
         dynamic_content_loading_delay: float = 2.0,
+        step_callback: typing.Optional["StepCallbackType"] = None,
     ) -> "HTMLContent":
         from web_queue.types.html_content import HTMLContent
         from web_queue.utils.html_to_str import htmls_to_str
@@ -59,13 +61,16 @@ class WebQueueClient:
             scrolling_times=scrolling_times,
             human_delay_base_delay=human_delay_base_delay,
             dynamic_content_loading_delay=dynamic_content_loading_delay,
+            step_callback=step_callback,
         )
         # Clean HTML
         html = self.clean.as_main_content(html)
         # Extract content metadata
-        html_metadata = await self.ai.as_html_metadata(html)
+        html_metadata = await self.ai.as_html_metadata(
+            html, step_callback=step_callback
+        )
         if not html_metadata:
             raise ValueError(f"Failed to retrieve content metadata for url: {url}")

{web_queue2-0.1.0 → web_queue2-0.2.0}/web_queue/client/ai/_ai.py RENAMED Viewed

@@ -11,6 +11,7 @@ from rich.pretty import pretty_repr
 from web_queue.client import WebQueueClient
 from web_queue.types.html_metadata_response import HTMLMetadataResponse
+from web_queue.types.step_callback import StepCallbackType
 from web_queue.utils.compression import compress, decompress
 if typing.TYPE_CHECKING:
@@ -25,7 +26,9 @@ class AI:
     @logfire.instrument
     async def as_html_metadata(
-        self, html: typing.Union["bs4.BeautifulSoup", typing.Text]
+        self,
+        html: typing.Union["bs4.BeautifulSoup", typing.Text],
+        step_callback: typing.Optional[StepCallbackType] = None,
     ) -> typing.Optional[HTMLMetadataResponse]:
         """Extract content metadata and CSS selector from HTML.
@@ -36,7 +39,10 @@ class AI:
         html = str(html)
-        logger.info(f"AI is extracting content metadata from HTML: {html}")
+        logger.info(
+            "AI is extracting content metadata from HTML: "
+            + f"{pretty_repr(str(html), max_string=64)}"
+        )
         cache_key = (
             "retrieve_html_content_metadata:"
@@ -103,6 +109,9 @@ class AI:
             """  # noqa: E501
         ).strip()
+        if step_callback:
+            step_callback(100, 75, "Starting to extract content metadata...")
         try:
             parsed_cmpl = await openai_client.chat.completions.parse(
                 messages=[
@@ -129,6 +138,9 @@ class AI:
                     compress(output.model_dump_json()),
                 )
+                if step_callback:
+                    step_callback(100, 90, "Finished extracting content metadata.")
                 return output
             else:

{web_queue2-0.1.0 → web_queue2-0.2.0}/web_queue/client/clean/_clean.py RENAMED Viewed

@@ -2,6 +2,7 @@ import logging
 import typing
 import bs4
+from rich.pretty import pretty_repr
 from web_queue.client import WebQueueClient
 from web_queue.utils.html_cleaner import HTMLCleaner
@@ -20,6 +21,6 @@ class Clean:
             else html
         )
-        logger.info(f"Cleaning HTML: {html}")
+        logger.info(f"Cleaning HTML: {pretty_repr(str(html), max_string=64)}")
         cleaned_html = HTMLCleaner.clean_as_main_content_html_str(html)
         return bs4.BeautifulSoup(cleaned_html, "html.parser")

{web_queue2-0.1.0 → web_queue2-0.2.0}/web_queue/client/web/_web.py RENAMED Viewed

@@ -14,6 +14,7 @@ from playwright.sync_api import TimeoutError as PlaywrightTimeoutError
 from str_or_none import str_or_none
 from web_queue.client import WebQueueClient
+from web_queue.types.step_callback import StepCallbackType
 from web_queue.utils.compression import compress, decompress
 from web_queue.utils.human_delay import human_delay
 from web_queue.utils.page_with_init_script import page_with_init_script
@@ -48,6 +49,7 @@ class Web:
         scrolling_times: int = 3,
         human_delay_base_delay: float = 1.2,
         dynamic_content_loading_delay: float = 2.0,
+        step_callback: typing.Optional[StepCallbackType] = None,
     ) -> bs4.BeautifulSoup:
         _url = str_or_none(str(url))
         if not _url:
@@ -77,6 +79,8 @@ class Web:
                     "--disable-features=VizDisplayCompositor",
                 ],
             )
+            if step_callback:
+                step_callback(100, 15, "Launching browser...")
             # Create context
             _viewport_size = secrets.choice(self.VIEWPORT_SIZES)
@@ -101,6 +105,9 @@ class Web:
             # Inject script to hide automation features
             page = await page_with_init_script(page)
+            if step_callback:
+                step_callback(100, 30, "Navigating to URL...")
             try:
                 # Navigate to URL
                 logger.debug(f"Navigating (timeout: {goto_timeout}ms) to {_url}")
@@ -117,6 +124,9 @@ class Web:
                 await page.wait_for_load_state("domcontentloaded")
                 await human_delay(h_delay)
+                if step_callback:
+                    step_callback(100, 45, "Waiting for full page load...")
                 # Simulate smooth mouse circling three times
                 start_position = None
                 for i in range(circling_times):
@@ -145,6 +155,9 @@ class Web:
                     f"Fetched HTML content size: {html_content_size} for {_url}"
                 )
+                if step_callback:
+                    step_callback(100, 60, "Finished fetching HTML content.")
                 # Screenshot and PDF
                 snapshot_filename = f"{int(time.time()*1E3)}_{secrets.token_hex(2)}"
                 screenshot_path = self.client.settings.web_screenshot_path.joinpath(

{web_queue2-0.1.0 → web_queue2-0.2.0}/web_queue/config.py RENAMED Viewed

@@ -1,8 +1,10 @@
 import functools
 import typing
+import cachetic
 import pydantic
 import pydantic_settings
+import redis
 import yarl
 from str_or_none import str_or_none
@@ -13,6 +15,7 @@ if typing.TYPE_CHECKING:
 class Settings(pydantic_settings.BaseSettings):
     WEB_QUEUE_NAME: str = pydantic.Field(default="web-queue")
     WEB_QUEUE_URL: pydantic.SecretStr = pydantic.SecretStr("")
+    MESSAGE_CACHE_EXPIRE_SECONDS: int = pydantic.Field(default=60 * 60 * 24)  # 1 day
     @pydantic.model_validator(mode="after")
     def validate_values(self) -> typing.Self:
@@ -28,6 +31,17 @@ class Settings(pydantic_settings.BaseSettings):
         return WebQueueClient()
+    @functools.cached_property
+    def message_cache(self) -> "cachetic.Cachetic[typing.Text]":
+        return cachetic.Cachetic(
+            object_type=pydantic.TypeAdapter(typing.Text),
+            cache_url=redis.from_url(self.WEB_QUEUE_URL.get_secret_value()),
+            default_ttl=self.MESSAGE_CACHE_EXPIRE_SECONDS,
+        )
     @property
     def web_queue_safe_url(self) -> str:
         return str(yarl.URL(self.WEB_QUEUE_URL.get_secret_value()).with_password("***"))
+    def get_message_cache_key(self, message_id: str) -> str:
+        return f"{self.WEB_QUEUE_NAME}:message:{message_id}"

{web_queue2-0.1.0 → web_queue2-0.2.0}/web_queue/types/message.py RENAMED Viewed

@@ -14,6 +14,7 @@ class MessageStatus(enum.StrEnum):
 class Message(pydantic.BaseModel):
     id: str | None = None
+    message: str = ""
     data: typing.Any
     status: MessageStatus = pydantic.Field(default=MessageStatus.PENDING)
     total_steps: int = pydantic.Field(default=100)

web_queue2-0.2.0/web_queue/types/step_callback.py ADDED Viewed

@@ -0,0 +1,10 @@
+import typing
+StepCallbackType: typing.TypeAlias = typing.Callable[
+    [
+        typing.Annotated[int, "total_steps"],
+        typing.Annotated[int, "completed_steps"],
+        typing.Annotated[str, "message"],
+    ],
+    None,
+]

web_queue2-0.1.0/web_queue/VERSION DELETED Viewed

	@@ -1 +0,0 @@
1	- 0.1.0

{web_queue2-0.1.0 → web_queue2-0.2.0}/LICENSE RENAMED Viewed

File without changes

{web_queue2-0.1.0 → web_queue2-0.2.0}/README.md RENAMED Viewed

File without changes

{web_queue2-0.1.0 → web_queue2-0.2.0}/web_queue/__init__.py RENAMED Viewed

File without changes

{web_queue2-0.1.0 → web_queue2-0.2.0}/web_queue/client/__init__.py RENAMED Viewed

File without changes

{web_queue2-0.1.0 → web_queue2-0.2.0}/web_queue/client/ai/__init__.py RENAMED Viewed

File without changes

{web_queue2-0.1.0 → web_queue2-0.2.0}/web_queue/client/clean/__init__.py RENAMED Viewed

File without changes

{web_queue2-0.1.0 → web_queue2-0.2.0}/web_queue/client/config.py RENAMED Viewed

File without changes

{web_queue2-0.1.0 → web_queue2-0.2.0}/web_queue/client/web/__init__.py RENAMED Viewed

File without changes

{web_queue2-0.1.0 → web_queue2-0.2.0}/web_queue/types/__init__.py RENAMED Viewed

File without changes

{web_queue2-0.1.0 → web_queue2-0.2.0}/web_queue/types/fetch_html_message.py RENAMED Viewed

File without changes

{web_queue2-0.1.0 → web_queue2-0.2.0}/web_queue/types/html_content.py RENAMED Viewed

File without changes

{web_queue2-0.1.0 → web_queue2-0.2.0}/web_queue/types/html_metadata_response.py RENAMED Viewed

File without changes

{web_queue2-0.1.0 → web_queue2-0.2.0}/web_queue/utils/__init__.py RENAMED Viewed

File without changes

{web_queue2-0.1.0 → web_queue2-0.2.0}/web_queue/utils/compression.py RENAMED Viewed

File without changes

{web_queue2-0.1.0 → web_queue2-0.2.0}/web_queue/utils/html_cleaner.py RENAMED Viewed

File without changes

{web_queue2-0.1.0 → web_queue2-0.2.0}/web_queue/utils/html_to_str.py RENAMED Viewed

File without changes

{web_queue2-0.1.0 → web_queue2-0.2.0}/web_queue/utils/human_delay.py RENAMED Viewed

File without changes

{web_queue2-0.1.0 → web_queue2-0.2.0}/web_queue/utils/page_with_init_script.py RENAMED Viewed

File without changes

{web_queue2-0.1.0 → web_queue2-0.2.0}/web_queue/utils/simulate_mouse_circling.py RENAMED Viewed

File without changes

{web_queue2-0.1.0 → web_queue2-0.2.0}/web_queue/utils/simulate_scrolling.py RENAMED Viewed

File without changes

web-queue2 0.1.0__tar.gz → 0.2.0__tar.gz

web-queue2 0.1.0__tar.gz → 0.2.0__tar.gz