web-queue2 0.2.0__tar.gz → 0.3.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (37)
  1. {web_queue2-0.2.0 → web_queue2-0.3.0}/PKG-INFO +3 -1
  2. {web_queue2-0.2.0 → web_queue2-0.3.0}/pyproject.toml +3 -1
  3. web_queue2-0.3.0/web_queue/VERSION +1 -0
  4. web_queue2-0.3.0/web_queue/app.py +127 -0
  5. web_queue2-0.3.0/web_queue/client/__init__.py +4 -0
  6. {web_queue2-0.2.0 → web_queue2-0.3.0}/web_queue/client/_client.py +9 -2
  7. {web_queue2-0.2.0 → web_queue2-0.3.0}/web_queue/client/ai/_ai.py +16 -4
  8. {web_queue2-0.2.0 → web_queue2-0.3.0}/web_queue/client/config.py +31 -0
  9. web_queue2-0.3.0/web_queue/client/messages/__init__.py +3 -0
  10. web_queue2-0.3.0/web_queue/client/messages/_messages.py +116 -0
  11. {web_queue2-0.2.0 → web_queue2-0.3.0}/web_queue/client/web/_web.py +30 -6
  12. {web_queue2-0.2.0 → web_queue2-0.3.0}/web_queue/types/message.py +13 -1
  13. web_queue2-0.3.0/web_queue/types/model_var.py +5 -0
  14. {web_queue2-0.2.0 → web_queue2-0.3.0}/web_queue/utils/html_to_str.py +3 -8
  15. web_queue2-0.2.0/web_queue/VERSION +0 -1
  16. web_queue2-0.2.0/web_queue/app.py +0 -91
  17. web_queue2-0.2.0/web_queue/client/__init__.py +0 -3
  18. web_queue2-0.2.0/web_queue/config.py +0 -47
  19. web_queue2-0.2.0/web_queue/types/step_callback.py +0 -10
  20. {web_queue2-0.2.0 → web_queue2-0.3.0}/LICENSE +0 -0
  21. {web_queue2-0.2.0 → web_queue2-0.3.0}/README.md +0 -0
  22. {web_queue2-0.2.0 → web_queue2-0.3.0}/web_queue/__init__.py +0 -0
  23. {web_queue2-0.2.0 → web_queue2-0.3.0}/web_queue/client/ai/__init__.py +0 -0
  24. {web_queue2-0.2.0 → web_queue2-0.3.0}/web_queue/client/clean/__init__.py +0 -0
  25. {web_queue2-0.2.0 → web_queue2-0.3.0}/web_queue/client/clean/_clean.py +0 -0
  26. {web_queue2-0.2.0 → web_queue2-0.3.0}/web_queue/client/web/__init__.py +0 -0
  27. {web_queue2-0.2.0 → web_queue2-0.3.0}/web_queue/types/__init__.py +0 -0
  28. {web_queue2-0.2.0 → web_queue2-0.3.0}/web_queue/types/fetch_html_message.py +0 -0
  29. {web_queue2-0.2.0 → web_queue2-0.3.0}/web_queue/types/html_content.py +0 -0
  30. {web_queue2-0.2.0 → web_queue2-0.3.0}/web_queue/types/html_metadata_response.py +0 -0
  31. {web_queue2-0.2.0 → web_queue2-0.3.0}/web_queue/utils/__init__.py +0 -0
  32. {web_queue2-0.2.0 → web_queue2-0.3.0}/web_queue/utils/compression.py +0 -0
  33. {web_queue2-0.2.0 → web_queue2-0.3.0}/web_queue/utils/html_cleaner.py +0 -0
  34. {web_queue2-0.2.0 → web_queue2-0.3.0}/web_queue/utils/human_delay.py +0 -0
  35. {web_queue2-0.2.0 → web_queue2-0.3.0}/web_queue/utils/page_with_init_script.py +0 -0
  36. {web_queue2-0.2.0 → web_queue2-0.3.0}/web_queue/utils/simulate_mouse_circling.py +0 -0
  37. {web_queue2-0.2.0 → web_queue2-0.3.0}/web_queue/utils/simulate_scrolling.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: web-queue2
3
- Version: 0.2.0
3
+ Version: 0.3.0
4
4
  Summary: Get web content from queue.
5
5
  License: MIT
6
6
  License-File: LICENSE
@@ -17,6 +17,7 @@ Requires-Dist: bs4
17
17
  Requires-Dist: cachetic
18
18
  Requires-Dist: dictpress
19
19
  Requires-Dist: fastapi
20
+ Requires-Dist: html-to-markdown
20
21
  Requires-Dist: httpx
21
22
  Requires-Dist: huey
22
23
  Requires-Dist: logfire[redis]
@@ -28,6 +29,7 @@ Requires-Dist: pydantic (>=2)
28
29
  Requires-Dist: pydantic-settings
29
30
  Requires-Dist: str-or-none
30
31
  Requires-Dist: tiktoken
32
+ Requires-Dist: typer
31
33
  Requires-Dist: yarl
32
34
  Project-URL: Homepage, https://github.com/allen2c/web-queue
33
35
  Project-URL: PyPI, https://pypi.org/project/web-queue/
@@ -5,6 +5,7 @@ dependencies = [
5
5
  "cachetic",
6
6
  "dictpress",
7
7
  "fastapi",
8
+ "html-to-markdown",
8
9
  "httpx",
9
10
  "huey",
10
11
  "logfire[redis]",
@@ -16,6 +17,7 @@ dependencies = [
16
17
  "pydantic-settings",
17
18
  "str-or-none",
18
19
  "tiktoken",
20
+ "typer",
19
21
  "yarl",
20
22
  ]
21
23
  description = "Get web content from queue."
@@ -23,7 +25,7 @@ license = { text = "MIT" }
23
25
  name = "web-queue2"
24
26
  readme = "README.md"
25
27
  requires-python = ">=3.11,<4"
26
- version = "0.2.0"
28
+ version = "0.3.0"
27
29
 
28
30
  [project.urls]
29
31
  Homepage = "https://github.com/allen2c/web-queue"
@@ -0,0 +1 @@
1
+ 0.3.0
@@ -0,0 +1,127 @@
1
+ import asyncio
2
+ import logging
3
+ import typing
4
+
5
+ import fastapi
6
+ import huey
7
+ import huey.exceptions
8
+ import logfire
9
+ import logging_bullet_train as lbt
10
+ from huey.api import Task
11
+
12
+ from web_queue.client import Settings, WebQueueClient
13
+ from web_queue.types.fetch_html_message import FetchHTMLMessage
14
+ from web_queue.types.html_content import HTMLContent
15
+ from web_queue.types.message import MessageStatus, MessageUpdate
16
+ from web_queue.types.model_var import ModelVar
17
+
18
+ lbt.set_logger("web_queue")
19
+
20
+ logfire.configure()
21
+ logfire.instrument_openai()
22
+
23
+ logger = logging.getLogger(__name__)
24
+
25
+ logger.info("Web queue app starting...")
26
+
27
+ wq_settings = Settings()
28
+ wq_client = WebQueueClient(wq_settings)
29
+ logger.info(f"Web queue connecting to redis: {wq_settings.web_queue_safe_url}")
30
+
31
+ huey_app = huey.RedisExpireHuey(
32
+ wq_settings.WEB_QUEUE_NAME,
33
+ url=wq_settings.WEB_QUEUE_URL.get_secret_value(),
34
+ expire_time=24 * 60 * 60, # 24 hours
35
+ )
36
+
37
+
38
def retrieve_result(task_id: str) -> typing.Optional[typing.Text]:
    """Look up the stored result of a Huey task, waiting briefly for it.

    Calls ``huey_app.result`` in blocking mode with a two-second timeout.

    Args:
        task_id: Identifier of the task whose result is wanted.

    Returns:
        The result exactly as ``huey_app.result`` returned it, or ``None``
        when the wait timed out or no result was available. Both of those
        cases are logged as errors.
    """
    try:
        payload: str | None = huey_app.result(
            task_id,
            blocking=True,
            timeout=2,
        )  # type: ignore
    except huey.exceptions.ResultTimeout:
        logger.error(f"Timeout waiting for result for task {task_id}")
        return None

    if payload is not None:
        return payload

    logger.error(f"No result found for task {task_id}")
    return None
51
+
52
+
53
def retrieve_result_as(task_id: str, model: typing.Type[ModelVar]) -> ModelVar:
    """Fetch a task result and parse it into ``model``.

    Args:
        task_id: Identifier of the task whose result is wanted.
        model: Model class used to validate the stored JSON result.

    Returns:
        The result parsed with ``model.model_validate_json``.

    Raises:
        fastapi.HTTPException: 404 when ``retrieve_result`` yields ``None``.
    """
    raw_json = retrieve_result(task_id)
    if raw_json is not None:
        return model.model_validate_json(raw_json)

    raise fastapi.HTTPException(
        status_code=404, detail=f"No result found for task {task_id}"
    )
60
+
61
+
62
@huey_app.task(
    retries=1,
    retry_delay=8,
    expires=24 * 60 * 60,
    context=True,
)
def fetch_html(
    message: typing.Union["FetchHTMLMessage", str, bytes], task: Task
) -> typing.Optional[typing.Text]:
    """Huey task: fetch a page through ``wq_client`` and publish its progress.

    The incoming message is normalised with ``FetchHTMLMessage.from_any``,
    stamped with the task id and marked RUNNING. Progress is written through
    the callback produced by ``wq_client.messages.wrap_update_message``, which
    is also handed to ``wq_client.fetch`` as its ``step_callback``.

    Args:
        message: The fetch request, as a ``FetchHTMLMessage`` or anything
            ``FetchHTMLMessage.from_any`` accepts.
        task: The running Huey task; its ``id`` becomes the message id.

    Returns:
        The fetched content dumped to JSON, or ``None`` when the fetch raised.
        In the failure case the exception is logged and a FAILED update is
        published; the exception is not re-raised.
    """
    from web_queue.types.fetch_html_message import FetchHTMLMessage

    def _progress(done: int, state: MessageStatus, text: str) -> MessageUpdate:
        # Every update emitted by this task reports against 100 total steps.
        return MessageUpdate(
            total_steps=100,
            completed_steps=done,
            status=state,
            message_text=text,
        )

    message = FetchHTMLMessage.from_any(message)
    message.id = task.id
    message.status = MessageStatus.RUNNING

    # The fetch coroutine is driven on a dedicated event loop created per call.
    event_loop = asyncio.new_event_loop()
    asyncio.set_event_loop(event_loop)

    publish = wq_client.messages.wrap_update_message(message.id, message)

    try:
        logger.info(f"Fetching HTML with parameters: {message.data.model_dump_json()}")
        publish(_progress(0, MessageStatus.RUNNING, "Starting to fetch HTML..."))

        fetch_coro = wq_client.fetch(
            **message.data.model_dump(), step_callback=publish
        )
        html_content: "HTMLContent" = event_loop.run_until_complete(fetch_coro)

        publish(_progress(100, MessageStatus.COMPLETED, "Finished fetching HTML"))

        return html_content.model_dump_json()

    except Exception as e:
        logger.exception(e)
        logger.error(f"Failed to fetch HTML: {e}")
        publish(_progress(100, MessageStatus.FAILED, f"Failed to fetch HTML: {e}"))

    finally:
        event_loop.close()

    # Only reached when the except branch handled a failure.
    return None
@@ -0,0 +1,4 @@
1
+ from web_queue.client._client import WebQueueClient
2
+ from web_queue.client.config import Settings
3
+
4
+ __all__ = ["WebQueueClient", "Settings"]
@@ -8,9 +8,10 @@ if typing.TYPE_CHECKING:
8
8
  from web_queue.client.ai import AI
9
9
  from web_queue.client.clean import Clean
10
10
  from web_queue.client.config import Settings
11
+ from web_queue.client.messages import Messages
11
12
  from web_queue.client.web import Web
12
13
  from web_queue.types.html_content import HTMLContent
13
- from web_queue.types.step_callback import StepCallbackType
14
+ from web_queue.types.message import MessageUpdate
14
15
 
15
16
 
16
17
  class WebQueueClient:
@@ -37,6 +38,12 @@ class WebQueueClient:
37
38
 
38
39
  return AI(self)
39
40
 
41
@functools.cached_property
def messages(self) -> "Messages":
    """Return the ``Messages`` helper bound to this client.

    The helper is created on first access and cached on the instance.
    """
    # Local import: at module level ``Messages`` is only imported for typing.
    from web_queue.client.messages import Messages as _Messages

    return _Messages(self)
46
+
40
47
  async def fetch(
41
48
  self,
42
49
  url: yarl.URL | httpx.URL | str,
@@ -47,7 +54,7 @@ class WebQueueClient:
47
54
  scrolling_times: int = 3,
48
55
  human_delay_base_delay: float = 1.2,
49
56
  dynamic_content_loading_delay: float = 2.0,
50
- step_callback: typing.Optional["StepCallbackType"] = None,
57
+ step_callback: typing.Optional[typing.Callable[["MessageUpdate"], None]] = None,
51
58
  ) -> "HTMLContent":
52
59
  from web_queue.types.html_content import HTMLContent
53
60
  from web_queue.utils.html_to_str import htmls_to_str
@@ -11,7 +11,7 @@ from rich.pretty import pretty_repr
11
11
 
12
12
  from web_queue.client import WebQueueClient
13
13
  from web_queue.types.html_metadata_response import HTMLMetadataResponse
14
- from web_queue.types.step_callback import StepCallbackType
14
+ from web_queue.types.message import MessageUpdate
15
15
  from web_queue.utils.compression import compress, decompress
16
16
 
17
17
  if typing.TYPE_CHECKING:
@@ -28,7 +28,7 @@ class AI:
28
28
  async def as_html_metadata(
29
29
  self,
30
30
  html: typing.Union["bs4.BeautifulSoup", typing.Text],
31
- step_callback: typing.Optional[StepCallbackType] = None,
31
+ step_callback: typing.Optional[typing.Callable[["MessageUpdate"], None]] = None,
32
32
  ) -> typing.Optional[HTMLMetadataResponse]:
33
33
  """Extract content metadata and CSS selector from HTML.
34
34
 
@@ -110,7 +110,13 @@ class AI:
110
110
  ).strip()
111
111
 
112
112
  if step_callback:
113
- step_callback(100, 75, "Starting to extract content metadata...")
113
+ step_callback(
114
+ MessageUpdate(
115
+ total_steps=100,
116
+ completed_steps=75,
117
+ message_text="Starting to extract content metadata...",
118
+ )
119
+ )
114
120
 
115
121
  try:
116
122
  parsed_cmpl = await openai_client.chat.completions.parse(
@@ -139,7 +145,13 @@ class AI:
139
145
  )
140
146
 
141
147
  if step_callback:
142
- step_callback(100, 90, "Finished extracting content metadata.")
148
+ step_callback(
149
+ MessageUpdate(
150
+ total_steps=100,
151
+ completed_steps=90,
152
+ message_text="Finished extracting content metadata",
153
+ )
154
+ )
143
155
 
144
156
  return output
145
157
 
@@ -6,9 +6,16 @@ import cachetic
6
6
  import openai
7
7
  import pydantic as pydantic
8
8
  import pydantic_settings
9
+ from str_or_none import str_or_none
9
10
 
10
11
 
11
12
  class Settings(pydantic_settings.BaseSettings):
13
+ # Core
14
+ WEB_QUEUE_NAME: str = pydantic.Field(default="web-queue")
15
+ WEB_QUEUE_URL: pydantic.SecretStr = pydantic.SecretStr("")
16
+ MESSAGE_CACHE_EXPIRE_SECONDS: int = pydantic.Field(default=60 * 60 * 24) # 1 day
17
+
18
+ # AI
12
19
  OPENAI_MODEL: str = pydantic.Field(default="gpt-4.1-nano")
13
20
  OPENAI_API_KEY: pydantic.SecretStr = pydantic.SecretStr("")
14
21
 
@@ -24,6 +31,24 @@ class Settings(pydantic_settings.BaseSettings):
24
31
  default=60 * 60 * 24
25
32
  ) # 1 day
26
33
 
34
@pydantic.model_validator(mode="after")
def validate_values(self) -> typing.Self:
    """Reject settings whose queue name or queue URL is missing.

    Each value is passed through ``str_or_none``; a ``None`` result is
    treated as "not provided".

    Raises:
        ValueError: naming the first required setting that is missing.
    """
    required = (
        ("WEB_QUEUE_NAME", self.WEB_QUEUE_NAME),
        ("WEB_QUEUE_URL", self.WEB_QUEUE_URL.get_secret_value()),
    )
    for setting_name, raw_value in required:
        if str_or_none(raw_value) is None:
            raise ValueError(f"{setting_name} is required")
    return self
41
+
42
@functools.cached_property
def message_cache(self) -> "cachetic.Cachetic[typing.Text]":
    """Return the text cache used for task messages.

    Built on first access and cached on the instance. It connects to the
    Redis instance at ``WEB_QUEUE_URL`` and uses
    ``MESSAGE_CACHE_EXPIRE_SECONDS`` as the default TTL.
    """
    import redis

    text_adapter = pydantic.TypeAdapter(typing.Text)
    redis_client = redis.from_url(self.WEB_QUEUE_URL.get_secret_value())
    ttl_seconds = self.MESSAGE_CACHE_EXPIRE_SECONDS

    return cachetic.Cachetic(
        object_type=text_adapter,
        cache_url=redis_client,
        default_ttl=ttl_seconds,
    )
51
+
27
52
  @functools.cached_property
28
53
  def openai_client(self) -> openai.AsyncOpenAI:
29
54
  return openai.AsyncOpenAI(api_key=self.OPENAI_API_KEY.get_secret_value())
@@ -48,6 +73,12 @@ class Settings(pydantic_settings.BaseSettings):
48
73
  default_ttl=self.COMPRESSED_BASE64_CACHE_EXPIRE_SECONDS,
49
74
  )
50
75
 
76
@property
def web_queue_safe_url(self) -> str:
    """Return the queue URL with its password replaced by ``***``.

    Intended for log output, where the real secret must not appear.
    """
    import yarl

    secret_url = self.WEB_QUEUE_URL.get_secret_value()
    masked_url = yarl.URL(secret_url).with_password("***")
    return str(masked_url)
81
+
51
82
  @property
52
83
  def web_screenshot_path(self) -> pathlib.Path:
53
84
  _path = pathlib.Path(self.WEB_SCREENSHOT_PATH)
@@ -0,0 +1,3 @@
1
+ from web_queue.client.messages._messages import Messages
2
+
3
+ __all__ = ["Messages"]
@@ -0,0 +1,116 @@
1
+ import time
2
+ import typing
3
+
4
+ import fastapi
5
+
6
+ from web_queue.client import WebQueueClient
7
+ from web_queue.types.message import Message, MessageStatus, MessageUpdate, MessageVar
8
+
9
+
10
+ class Messages:
11
+ def __init__(self, client: WebQueueClient):
12
+ self.client = client
13
+
14
+ def get(self, message_id: str, *, timeout: float = 10.0) -> typing.Optional[str]:
15
+ ts = time.perf_counter()
16
+ while time.perf_counter() - ts < timeout:
17
+ message_cache_key = self.get_cache_key(message_id)
18
+ maybe_json = self.client.settings.message_cache.get(message_cache_key)
19
+ if maybe_json is not None:
20
+ break
21
+ else:
22
+ time.sleep(0.1)
23
+
24
+ if maybe_json is None:
25
+ return None
26
+ return maybe_json
27
+
28
+ def retrieve(self, message_id: str, *, timeout: float = 10.0) -> str:
29
+ json_data = self.get(message_id, timeout=timeout)
30
+ if json_data is None:
31
+ raise fastapi.HTTPException(status_code=404, detail="Message not found")
32
+ return json_data
33
+
34
+ def retrieve_as(
35
+ self, message_id: str, model: typing.Type[MessageVar], *, timeout: float = 10.0
36
+ ) -> MessageVar:
37
+ json_data = self.retrieve(message_id, timeout=timeout)
38
+ return model.model_validate_json(json_data)
39
+
40
+ def set(self, message_id: str, message: Message) -> None:
41
+ message_cache_key = self.get_cache_key(message_id)
42
+ self.client.settings.message_cache.set(
43
+ message_cache_key, message.model_dump_json()
44
+ )
45
+
46
+ def update(
47
+ self,
48
+ message_id: str,
49
+ message_update: MessageUpdate,
50
+ ) -> Message:
51
+ message = self.retrieve_as(message_id, Message)
52
+ if message_update.message_text is not None:
53
+ message.message_text = message_update.message_text
54
+ if message_update.data is not None:
55
+ message.data = message_update.data
56
+ if message_update.status is not None:
57
+ message.status = message_update.status
58
+ if message_update.total_steps is not None:
59
+ message.total_steps = message_update.total_steps
60
+ if message_update.completed_steps is not None:
61
+ message.completed_steps = message_update.completed_steps
62
+ if message_update.error is not None:
63
+ message.error = message_update.error
64
+ self.set(message_id, message)
65
+
66
+ return message
67
+
68
+ def wrap_update_message(
69
+ self, message_id: str, message: Message
70
+ ) -> typing.Callable[[MessageUpdate], None]:
71
+ def _update(message_update: MessageUpdate) -> None:
72
+ if message_update.message_text is not None:
73
+ message.message_text = message_update.message_text
74
+ if message_update.data is not None:
75
+ message.data = message_update.data
76
+ if message_update.status is not None:
77
+ message.status = message_update.status
78
+ if message_update.total_steps is not None:
79
+ message.total_steps = message_update.total_steps
80
+ if message_update.completed_steps is not None:
81
+ message.completed_steps = message_update.completed_steps
82
+ if message_update.error is not None:
83
+ message.error = message_update.error
84
+ self.set(message_id, message)
85
+
86
+ return _update
87
+
88
+ def poll_util_done(
89
+ self,
90
+ message_id: str,
91
+ *,
92
+ timeout: float = 60.0,
93
+ model: typing.Type[MessageVar],
94
+ delay: float = 0.2,
95
+ ) -> MessageVar:
96
+ ts = time.perf_counter()
97
+ msg: MessageVar | None = None
98
+
99
+ while is_timeout := (time.perf_counter() - ts < timeout):
100
+ msg = self.retrieve_as(message_id, model)
101
+ if msg.status in [MessageStatus.COMPLETED, MessageStatus.FAILED]:
102
+ break
103
+ time.sleep(delay)
104
+
105
+ if msg is None:
106
+ raise fastapi.HTTPException(status_code=404, detail="Message not found")
107
+
108
+ if is_timeout:
109
+ raise fastapi.HTTPException(
110
+ status_code=408, detail="Timeout waiting for message to be done"
111
+ )
112
+
113
+ return msg
114
+
115
+ def get_cache_key(self, message_id: str) -> str:
116
+ return f"{self.client.settings.WEB_QUEUE_NAME}:message:{message_id}"
@@ -14,7 +14,7 @@ from playwright.sync_api import TimeoutError as PlaywrightTimeoutError
14
14
  from str_or_none import str_or_none
15
15
 
16
16
  from web_queue.client import WebQueueClient
17
- from web_queue.types.step_callback import StepCallbackType
17
+ from web_queue.types.message import MessageUpdate
18
18
  from web_queue.utils.compression import compress, decompress
19
19
  from web_queue.utils.human_delay import human_delay
20
20
  from web_queue.utils.page_with_init_script import page_with_init_script
@@ -49,7 +49,7 @@ class Web:
49
49
  scrolling_times: int = 3,
50
50
  human_delay_base_delay: float = 1.2,
51
51
  dynamic_content_loading_delay: float = 2.0,
52
- step_callback: typing.Optional[StepCallbackType] = None,
52
+ step_callback: typing.Optional[typing.Callable[["MessageUpdate"], None]] = None,
53
53
  ) -> bs4.BeautifulSoup:
54
54
  _url = str_or_none(str(url))
55
55
  if not _url:
@@ -80,7 +80,13 @@ class Web:
80
80
  ],
81
81
  )
82
82
  if step_callback:
83
- step_callback(100, 15, "Launching browser...")
83
+ step_callback(
84
+ MessageUpdate(
85
+ total_steps=100,
86
+ completed_steps=15,
87
+ message_text="Launching browser...",
88
+ )
89
+ )
84
90
 
85
91
  # Create context
86
92
  _viewport_size = secrets.choice(self.VIEWPORT_SIZES)
@@ -106,7 +112,13 @@ class Web:
106
112
  page = await page_with_init_script(page)
107
113
 
108
114
  if step_callback:
109
- step_callback(100, 30, "Navigating to URL...")
115
+ step_callback(
116
+ MessageUpdate(
117
+ total_steps=100,
118
+ completed_steps=30,
119
+ message_text="Navigating to URL...",
120
+ )
121
+ )
110
122
 
111
123
  try:
112
124
  # Navigate to URL
@@ -125,7 +137,13 @@ class Web:
125
137
  await human_delay(h_delay)
126
138
 
127
139
  if step_callback:
128
- step_callback(100, 45, "Waiting for full page load...")
140
+ step_callback(
141
+ MessageUpdate(
142
+ total_steps=100,
143
+ completed_steps=45,
144
+ message_text="Waiting for full page load...",
145
+ )
146
+ )
129
147
 
130
148
  # Simulate smooth mouse circling three times
131
149
  start_position = None
@@ -156,7 +174,13 @@ class Web:
156
174
  )
157
175
 
158
176
  if step_callback:
159
- step_callback(100, 60, "Finished fetching HTML content.")
177
+ step_callback(
178
+ MessageUpdate(
179
+ total_steps=100,
180
+ completed_steps=60,
181
+ message_text="Finished fetching HTML content",
182
+ )
183
+ )
160
184
 
161
185
  # Screenshot and PDF
162
186
  snapshot_filename = f"{int(time.time()*1E3)}_{secrets.token_hex(2)}"
@@ -14,7 +14,7 @@ class MessageStatus(enum.StrEnum):
14
14
 
15
15
  class Message(pydantic.BaseModel):
16
16
  id: str | None = None
17
- message: str = ""
17
+ message_text: str = ""
18
18
  data: typing.Any
19
19
  status: MessageStatus = pydantic.Field(default=MessageStatus.PENDING)
20
20
  total_steps: int = pydantic.Field(default=100)
@@ -33,3 +33,15 @@ class Message(pydantic.BaseModel):
33
33
  return cls.model_validate_json(any)
34
34
  else:
35
35
  raise ValueError(f"Invalid type: {type(any)}")
36
+
37
+
38
+ MessageVar = typing.TypeVar("MessageVar", bound=Message)
39
+
40
+
41
class MessageUpdate(pydantic.BaseModel):
    # Partial update for a Message. Every field defaults to None.

    # New human-readable progress text.
    message_text: str | None = None
    # Replacement payload; any value is accepted.
    data: typing.Optional[typing.Any] = None
    # New lifecycle status.
    status: MessageStatus | None = None
    # Step counters used for progress reporting.
    total_steps: int | None = None
    completed_steps: int | None = None
    # Error description.
    error: str | None = None
@@ -0,0 +1,5 @@
1
+ import typing
2
+
3
+ import pydantic
4
+
5
+ ModelVar = typing.TypeVar("ModelVar", bound=pydantic.BaseModel)
@@ -1,18 +1,13 @@
1
1
  import typing
2
2
 
3
3
  import bs4
4
+ import html_to_markdown
4
5
 
5
6
 
6
7
  def html_to_str(html: bs4.BeautifulSoup | bs4.Tag | str) -> str:
7
8
  html = bs4.BeautifulSoup(html, "html.parser") if isinstance(html, str) else html
8
-
9
- full_text = ""
10
- for p in html.find_all("p"):
11
- content = p.get_text(separator="\n", strip=True)
12
- full_text += content
13
- full_text += "\n"
14
-
15
- return full_text.strip()
9
+ content = html_to_markdown.convert(str(html)).strip()
10
+ return "\n".join(line.rstrip() for line in content.splitlines())
16
11
 
17
12
 
18
13
  def htmls_to_str(
@@ -1 +0,0 @@
1
- 0.2.0
@@ -1,91 +0,0 @@
1
- import asyncio
2
- import logging
3
- import typing
4
-
5
- import huey
6
- import logfire
7
- import logging_bullet_train as lbt
8
- from huey.api import Task
9
-
10
- import web_queue.config
11
- from web_queue.types.message import MessageStatus
12
-
13
- if typing.TYPE_CHECKING:
14
- from web_queue.client import WebQueueClient
15
- from web_queue.types.fetch_html_message import FetchHTMLMessage
16
- from web_queue.types.html_content import HTMLContent
17
-
18
- lbt.set_logger("web_queue")
19
-
20
- logfire.configure()
21
- logfire.instrument_openai()
22
-
23
- logger = logging.getLogger(__name__)
24
-
25
- logger.info("Web queue app starting...")
26
-
27
- web_queue_settings = web_queue.config.Settings()
28
- logger.info(f"Web queue connecting to redis: {web_queue_settings.web_queue_safe_url}")
29
-
30
- huey_app = huey.RedisExpireHuey(
31
- web_queue_settings.WEB_QUEUE_NAME,
32
- url=web_queue_settings.WEB_QUEUE_URL.get_secret_value(),
33
- expire_time=24 * 60 * 60, # 24 hours
34
- )
35
-
36
-
37
- @huey_app.task(
38
- retries=1,
39
- retry_delay=8,
40
- expires=24 * 60 * 60,
41
- context=True,
42
- )
43
- def fetch_html(
44
- message: typing.Union["FetchHTMLMessage", str, bytes], task: Task
45
- ) -> typing.Optional[typing.Text]:
46
- from web_queue.types.fetch_html_message import FetchHTMLMessage
47
-
48
- message = FetchHTMLMessage.from_any(message)
49
- message.id = task.id
50
- message.status = MessageStatus.RUNNING
51
-
52
- wq_cache_key = web_queue_settings.get_message_cache_key(message.id)
53
-
54
- def update_message_cache(
55
- total_steps: int | None = None,
56
- completed_steps: int | None = None,
57
- message_text: str | None = None,
58
- ):
59
- if total_steps is not None:
60
- message.total_steps = total_steps
61
- if completed_steps is not None:
62
- message.completed_steps = completed_steps
63
- if message_text is not None:
64
- message.message = message_text
65
- web_queue_settings.message_cache.set(wq_cache_key, message.model_dump_json())
66
-
67
- logger.info(f"Fetching HTML with parameters: {message.data.model_dump_json()}")
68
- update_message_cache(message_text="Starting to fetch HTML...")
69
-
70
- loop = asyncio.new_event_loop()
71
- asyncio.set_event_loop(loop)
72
-
73
- try:
74
- wq_client: "WebQueueClient" = web_queue_settings.web_queue_client
75
- html_content: "HTMLContent" = loop.run_until_complete(
76
- wq_client.fetch(
77
- **message.data.model_dump(), step_callback=update_message_cache
78
- )
79
- )
80
- update_message_cache(100, 100, "Finished fetching HTML.")
81
- return html_content.model_dump_json()
82
-
83
- except Exception as e:
84
- logger.exception(e)
85
- logger.error(f"Failed to fetch HTML: {e}")
86
- update_message_cache(message_text=f"Failed to fetch HTML: {e}")
87
-
88
- finally:
89
- loop.close()
90
-
91
- return None
@@ -1,3 +0,0 @@
1
- from web_queue.client._client import WebQueueClient
2
-
3
- __all__ = ["WebQueueClient"]
@@ -1,47 +0,0 @@
1
- import functools
2
- import typing
3
-
4
- import cachetic
5
- import pydantic
6
- import pydantic_settings
7
- import redis
8
- import yarl
9
- from str_or_none import str_or_none
10
-
11
- if typing.TYPE_CHECKING:
12
- from web_queue.client import WebQueueClient
13
-
14
-
15
- class Settings(pydantic_settings.BaseSettings):
16
- WEB_QUEUE_NAME: str = pydantic.Field(default="web-queue")
17
- WEB_QUEUE_URL: pydantic.SecretStr = pydantic.SecretStr("")
18
- MESSAGE_CACHE_EXPIRE_SECONDS: int = pydantic.Field(default=60 * 60 * 24) # 1 day
19
-
20
- @pydantic.model_validator(mode="after")
21
- def validate_values(self) -> typing.Self:
22
- if str_or_none(self.WEB_QUEUE_NAME) is None:
23
- raise ValueError("WEB_QUEUE_NAME is required")
24
- if str_or_none(self.WEB_QUEUE_URL.get_secret_value()) is None:
25
- raise ValueError("WEB_QUEUE_URL is required")
26
- return self
27
-
28
- @functools.cached_property
29
- def web_queue_client(self) -> "WebQueueClient":
30
- from web_queue.client import WebQueueClient
31
-
32
- return WebQueueClient()
33
-
34
- @functools.cached_property
35
- def message_cache(self) -> "cachetic.Cachetic[typing.Text]":
36
- return cachetic.Cachetic(
37
- object_type=pydantic.TypeAdapter(typing.Text),
38
- cache_url=redis.from_url(self.WEB_QUEUE_URL.get_secret_value()),
39
- default_ttl=self.MESSAGE_CACHE_EXPIRE_SECONDS,
40
- )
41
-
42
- @property
43
- def web_queue_safe_url(self) -> str:
44
- return str(yarl.URL(self.WEB_QUEUE_URL.get_secret_value()).with_password("***"))
45
-
46
- def get_message_cache_key(self, message_id: str) -> str:
47
- return f"{self.WEB_QUEUE_NAME}:message:{message_id}"
@@ -1,10 +0,0 @@
1
- import typing
2
-
3
- StepCallbackType: typing.TypeAlias = typing.Callable[
4
- [
5
- typing.Annotated[int, "total_steps"],
6
- typing.Annotated[int, "completed_steps"],
7
- typing.Annotated[str, "message"],
8
- ],
9
- None,
10
- ]
File without changes
File without changes