hyperbrowser 0.23.0__tar.gz → 0.24.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of hyperbrowser might be problematic.
- {hyperbrowser-0.23.0 → hyperbrowser-0.24.0}/PKG-INFO +1 -1
- hyperbrowser-0.24.0/hyperbrowser/client/managers/async_manager/crawl.py +101 -0
- {hyperbrowser-0.23.0 → hyperbrowser-0.24.0}/hyperbrowser/client/managers/async_manager/extract.py +19 -4
- hyperbrowser-0.24.0/hyperbrowser/client/managers/async_manager/scrape.py +150 -0
- hyperbrowser-0.24.0/hyperbrowser/client/managers/sync_manager/crawl.py +102 -0
- {hyperbrowser-0.23.0 → hyperbrowser-0.24.0}/hyperbrowser/client/managers/sync_manager/extract.py +18 -4
- hyperbrowser-0.24.0/hyperbrowser/client/managers/sync_manager/scrape.py +148 -0
- {hyperbrowser-0.23.0 → hyperbrowser-0.24.0}/hyperbrowser/models/consts.py +2 -0
- {hyperbrowser-0.23.0 → hyperbrowser-0.24.0}/hyperbrowser/models/crawl.py +2 -2
- {hyperbrowser-0.23.0 → hyperbrowser-0.24.0}/hyperbrowser/models/extract.py +5 -1
- {hyperbrowser-0.23.0 → hyperbrowser-0.24.0}/hyperbrowser/models/scrape.py +76 -1
- {hyperbrowser-0.23.0 → hyperbrowser-0.24.0}/pyproject.toml +1 -1
- hyperbrowser-0.23.0/hyperbrowser/client/managers/async_manager/crawl.py +0 -60
- hyperbrowser-0.23.0/hyperbrowser/client/managers/async_manager/scrape.py +0 -36
- hyperbrowser-0.23.0/hyperbrowser/client/managers/sync_manager/crawl.py +0 -60
- hyperbrowser-0.23.0/hyperbrowser/client/managers/sync_manager/scrape.py +0 -36
- {hyperbrowser-0.23.0 → hyperbrowser-0.24.0}/LICENSE +0 -0
- {hyperbrowser-0.23.0 → hyperbrowser-0.24.0}/README.md +0 -0
- {hyperbrowser-0.23.0 → hyperbrowser-0.24.0}/hyperbrowser/__init__.py +0 -0
- {hyperbrowser-0.23.0 → hyperbrowser-0.24.0}/hyperbrowser/client/async_client.py +0 -0
- {hyperbrowser-0.23.0 → hyperbrowser-0.24.0}/hyperbrowser/client/base.py +0 -0
- {hyperbrowser-0.23.0 → hyperbrowser-0.24.0}/hyperbrowser/client/managers/async_manager/extension.py +0 -0
- {hyperbrowser-0.23.0 → hyperbrowser-0.24.0}/hyperbrowser/client/managers/async_manager/profile.py +0 -0
- {hyperbrowser-0.23.0 → hyperbrowser-0.24.0}/hyperbrowser/client/managers/async_manager/session.py +0 -0
- {hyperbrowser-0.23.0 → hyperbrowser-0.24.0}/hyperbrowser/client/managers/sync_manager/extension.py +0 -0
- {hyperbrowser-0.23.0 → hyperbrowser-0.24.0}/hyperbrowser/client/managers/sync_manager/profile.py +0 -0
- {hyperbrowser-0.23.0 → hyperbrowser-0.24.0}/hyperbrowser/client/managers/sync_manager/session.py +0 -0
- {hyperbrowser-0.23.0 → hyperbrowser-0.24.0}/hyperbrowser/client/sync.py +0 -0
- {hyperbrowser-0.23.0 → hyperbrowser-0.24.0}/hyperbrowser/config.py +0 -0
- {hyperbrowser-0.23.0 → hyperbrowser-0.24.0}/hyperbrowser/exceptions.py +0 -0
- {hyperbrowser-0.23.0 → hyperbrowser-0.24.0}/hyperbrowser/models/extension.py +0 -0
- {hyperbrowser-0.23.0 → hyperbrowser-0.24.0}/hyperbrowser/models/profile.py +0 -0
- {hyperbrowser-0.23.0 → hyperbrowser-0.24.0}/hyperbrowser/models/session.py +0 -0
- {hyperbrowser-0.23.0 → hyperbrowser-0.24.0}/hyperbrowser/tools/__init__.py +0 -0
- {hyperbrowser-0.23.0 → hyperbrowser-0.24.0}/hyperbrowser/tools/anthropic.py +0 -0
- {hyperbrowser-0.23.0 → hyperbrowser-0.24.0}/hyperbrowser/tools/openai.py +0 -0
- {hyperbrowser-0.23.0 → hyperbrowser-0.24.0}/hyperbrowser/tools/schema.py +0 -0
- {hyperbrowser-0.23.0 → hyperbrowser-0.24.0}/hyperbrowser/transport/async_transport.py +0 -0
- {hyperbrowser-0.23.0 → hyperbrowser-0.24.0}/hyperbrowser/transport/base.py +0 -0
- {hyperbrowser-0.23.0 → hyperbrowser-0.24.0}/hyperbrowser/transport/sync.py +0 -0
hyperbrowser-0.24.0/hyperbrowser/client/managers/async_manager/crawl.py
@@ -0,0 +1,101 @@
+import asyncio
+
+from hyperbrowser.models.consts import POLLING_ATTEMPTS
+from ....models.crawl import (
+    CrawlJobResponse,
+    GetCrawlJobParams,
+    StartCrawlJobParams,
+    StartCrawlJobResponse,
+)
+from ....exceptions import HyperbrowserError
+
+
+class CrawlManager:
+    def __init__(self, client):
+        self._client = client
+
+    async def start(self, params: StartCrawlJobParams) -> StartCrawlJobResponse:
+        response = await self._client.transport.post(
+            self._client._build_url("/crawl"),
+            data=params.model_dump(exclude_none=True, by_alias=True),
+        )
+        return StartCrawlJobResponse(**response.data)
+
+    async def get(
+        self, job_id: str, params: GetCrawlJobParams = GetCrawlJobParams()
+    ) -> CrawlJobResponse:
+        response = await self._client.transport.get(
+            self._client._build_url(f"/crawl/{job_id}"), params=params.__dict__
+        )
+        return CrawlJobResponse(**response.data)
+
+    async def start_and_wait(
+        self, params: StartCrawlJobParams, return_all_pages: bool = True
+    ) -> CrawlJobResponse:
+        job_start_resp = await self.start(params)
+        job_id = job_start_resp.job_id
+        if not job_id:
+            raise HyperbrowserError("Failed to start crawl job")
+
+        job_response: CrawlJobResponse
+        failures = 0
+        while True:
+            try:
+                job_response = await self.get(
+                    job_id,
+                    params=GetCrawlJobParams(batch_size=1),
+                )
+                if (
+                    job_response.status == "completed"
+                    or job_response.status == "failed"
+                ):
+                    break
+            except Exception as e:
+                failures += 1
+                if failures >= POLLING_ATTEMPTS:
+                    raise HyperbrowserError(
+                        f"Failed to poll crawl job {job_id} after {POLLING_ATTEMPTS} attempts: {e}"
+                    )
+            await asyncio.sleep(2)
+
+        failures = 0
+        if not return_all_pages:
+            while True:
+                try:
+                    job_response = await self.get(job_id)
+                    return job_response
+                except Exception as e:
+                    failures += 1
+                    if failures >= POLLING_ATTEMPTS:
+                        raise HyperbrowserError(
+                            f"Failed to get crawl job {job_id} after {POLLING_ATTEMPTS} attempts: {e}"
+                        )
+                await asyncio.sleep(0.5)
+
+        failures = 0
+        job_response.current_page_batch = 0
+        job_response.data = []
+        while job_response.current_page_batch < job_response.total_page_batches:
+            try:
+                tmp_job_response = await self.get(
+                    job_start_resp.job_id,
+                    GetCrawlJobParams(
+                        page=job_response.current_page_batch + 1, batch_size=100
+                    ),
+                )
+                if tmp_job_response.data:
+                    job_response.data.extend(tmp_job_response.data)
+                job_response.current_page_batch = tmp_job_response.current_page_batch
+                job_response.total_crawled_pages = tmp_job_response.total_crawled_pages
+                job_response.total_page_batches = tmp_job_response.total_page_batches
+                job_response.batch_size = tmp_job_response.batch_size
+                failures = 0
+            except Exception as e:
+                failures += 1
+                if failures >= POLLING_ATTEMPTS:
+                    raise HyperbrowserError(
+                        f"Failed to get crawl batch page {job_response.current_page_batch} for job {job_id} after {POLLING_ATTEMPTS} attempts: {e}"
+                    )
+            await asyncio.sleep(0.5)
+
+        return job_response
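
The new CrawlManager polls with batch_size=1 until the job finishes, then re-fetches the results in batches of 100 and stitches them into one CrawlJobResponse. A minimal usage sketch, assuming the SDK exposes this manager as client.crawl on an AsyncHyperbrowser client constructed with an api_key (those names are assumptions; only the manager itself appears in this diff):

import asyncio

from hyperbrowser import AsyncHyperbrowser  # assumed client entry point, not part of this diff
from hyperbrowser.models.crawl import StartCrawlJobParams

async def main() -> None:
    client = AsyncHyperbrowser(api_key="YOUR_API_KEY")  # hypothetical credentials
    # Waits for completion, then collects every page batch into job.data.
    job = await client.crawl.start_and_wait(
        StartCrawlJobParams(url="https://example.com", max_pages=5),
        return_all_pages=True,
    )
    print(job.status, job.total_crawled_pages)

asyncio.run(main())
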
{hyperbrowser-0.23.0 → hyperbrowser-0.24.0}/hyperbrowser/client/managers/async_manager/extract.py
RENAMED
@@ -1,5 +1,6 @@
 import asyncio
 from hyperbrowser.exceptions import HyperbrowserError
+from hyperbrowser.models.consts import POLLING_ATTEMPTS
 from hyperbrowser.models.extract import (
     ExtractJobResponse,
     StartExtractJobParams,
@@ -32,10 +33,24 @@ class ExtractManager:
 
     async def start_and_wait(self, params: StartExtractJobParams) -> ExtractJobResponse:
         job_start_resp = await self.start(params)
-        if not job_start_resp.job_id:
+        job_id = job_start_resp.job_id
+        if not job_id:
             raise HyperbrowserError("Failed to start extract job")
+
+        failures = 0
         while True:
-            job_response = await self.get(job_start_resp.job_id)
-            if job_response.status == "completed" or job_response.status == "failed":
-                return job_response
+            try:
+                job_response = await self.get(job_id)
+                if (
+                    job_response.status == "completed"
+                    or job_response.status == "failed"
+                ):
+                    return job_response
+                failures = 0
+            except Exception as e:
+                failures += 1
+                if failures >= POLLING_ATTEMPTS:
+                    raise HyperbrowserError(
+                        f"Failed to poll extract job {job_id} after {POLLING_ATTEMPTS} attempts: {e}"
+                    )
             await asyncio.sleep(2)

hyperbrowser-0.24.0/hyperbrowser/client/managers/async_manager/scrape.py
@@ -0,0 +1,150 @@
+import asyncio
+from typing import Optional
+
+from hyperbrowser.models.consts import POLLING_ATTEMPTS
+from ....models.scrape import (
+    BatchScrapeJobResponse,
+    GetBatchScrapeJobParams,
+    ScrapeJobResponse,
+    StartBatchScrapeJobParams,
+    StartBatchScrapeJobResponse,
+    StartScrapeJobParams,
+    StartScrapeJobResponse,
+)
+from ....exceptions import HyperbrowserError
+
+
+class BatchScrapeManager:
+    def __init__(self, client):
+        self._client = client
+
+    async def start(
+        self, params: StartBatchScrapeJobParams
+    ) -> StartBatchScrapeJobResponse:
+        response = await self._client.transport.post(
+            self._client._build_url("/scrape/batch"),
+            data=params.model_dump(exclude_none=True, by_alias=True),
+        )
+        return StartBatchScrapeJobResponse(**response.data)
+
+    async def get(
+        self, job_id: str, params: GetBatchScrapeJobParams = GetBatchScrapeJobParams()
+    ) -> BatchScrapeJobResponse:
+        response = await self._client.transport.get(
+            self._client._build_url(f"/scrape/batch/{job_id}"), params=params.__dict__
+        )
+        return BatchScrapeJobResponse(**response.data)
+
+    async def start_and_wait(
+        self, params: StartBatchScrapeJobParams, return_all_pages: bool = True
+    ) -> BatchScrapeJobResponse:
+        job_start_resp = await self.start(params)
+        job_id = job_start_resp.job_id
+        if not job_id:
+            raise HyperbrowserError("Failed to start batch scrape job")
+
+        job_response: BatchScrapeJobResponse
+        failures = 0
+        while True:
+            try:
+                job_response = await self.get(
+                    job_id, params=GetBatchScrapeJobParams(batch_size=1)
+                )
+                if (
+                    job_response.status == "completed"
+                    or job_response.status == "failed"
+                ):
+                    break
+                failures = 0
+            except Exception as e:
+                failures += 1
+                if failures >= POLLING_ATTEMPTS:
+                    raise HyperbrowserError(
+                        f"Failed to poll batch scrape job {job_id} after {POLLING_ATTEMPTS} attempts: {e}"
+                    )
+            await asyncio.sleep(2)
+
+        failures = 0
+        if not return_all_pages:
+            while True:
+                try:
+                    job_response = await self.get(job_id)
+                    return job_response
+                except Exception as e:
+                    failures += 1
+                    if failures >= POLLING_ATTEMPTS:
+                        raise HyperbrowserError(
+                            f"Failed to get batch scrape job {job_id} after {POLLING_ATTEMPTS} attempts: {e}"
+                        )
+                await asyncio.sleep(0.5)
+
+        failures = 0
+        job_response.current_page_batch = 0
+        job_response.data = []
+        while job_response.current_page_batch < job_response.total_page_batches:
+            try:
+                tmp_job_response = await self.get(
+                    job_id,
+                    params=GetBatchScrapeJobParams(
+                        page=job_response.current_page_batch + 1, batch_size=100
+                    ),
+                )
+                if tmp_job_response.data:
+                    job_response.data.extend(tmp_job_response.data)
+                job_response.current_page_batch = tmp_job_response.current_page_batch
+                job_response.total_scraped_pages = tmp_job_response.total_scraped_pages
+                job_response.total_page_batches = tmp_job_response.total_page_batches
+                job_response.batch_size = tmp_job_response.batch_size
+                failures = 0
+            except Exception as e:
+                failures += 1
+                if failures >= POLLING_ATTEMPTS:
+                    raise HyperbrowserError(
+                        f"Failed to get batch page {job_response.current_page_batch} for job {job_id} after {POLLING_ATTEMPTS} attempts: {e}"
+                    )
+            await asyncio.sleep(0.5)
+
+        return job_response
+
+
+class ScrapeManager:
+    def __init__(self, client):
+        self._client = client
+        self.batch = BatchScrapeManager(client)
+
+    async def start(self, params: StartScrapeJobParams) -> StartScrapeJobResponse:
+        response = await self._client.transport.post(
+            self._client._build_url("/scrape"),
+            data=params.model_dump(exclude_none=True, by_alias=True),
+        )
+        return StartScrapeJobResponse(**response.data)
+
+    async def get(self, job_id: str) -> ScrapeJobResponse:
+        response = await self._client.transport.get(
+            self._client._build_url(f"/scrape/{job_id}")
+        )
+        return ScrapeJobResponse(**response.data)
+
+    async def start_and_wait(self, params: StartScrapeJobParams) -> ScrapeJobResponse:
+        job_start_resp = await self.start(params)
+        job_id = job_start_resp.job_id
+        if not job_id:
+            raise HyperbrowserError("Failed to start scrape job")
+
+        failures = 0
+        while True:
+            try:
+                job_response = await self.get(job_id)
+                if (
+                    job_response.status == "completed"
+                    or job_response.status == "failed"
+                ):
+                    return job_response
+                failures = 0
+            except Exception as e:
+                failures += 1
+                if failures >= POLLING_ATTEMPTS:
+                    raise HyperbrowserError(
+                        f"Failed to poll scrape job {job_id} after {POLLING_ATTEMPTS} attempts: {e}"
+                    )
+            await asyncio.sleep(2)
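
BatchScrapeManager applies the same polling-and-pagination pattern to batch scrape jobs and is wired onto ScrapeManager as the batch attribute. A hedged sketch of calling it, again assuming an AsyncHyperbrowser client that exposes the manager as client.scrape (the client construction is an assumption; the .batch wiring is taken from the code above):

import asyncio

from hyperbrowser import AsyncHyperbrowser  # assumed client entry point, not part of this diff
from hyperbrowser.models.scrape import StartBatchScrapeJobParams

async def main() -> None:
    client = AsyncHyperbrowser(api_key="YOUR_API_KEY")  # hypothetical credentials
    # One job scrapes several URLs; results come back paged, 100 at a time.
    job = await client.scrape.batch.start_and_wait(
        StartBatchScrapeJobParams(urls=["https://example.com", "https://example.org"])
    )
    for page in job.data or []:
        print(page.url, page.status)

asyncio.run(main())
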

hyperbrowser-0.24.0/hyperbrowser/client/managers/sync_manager/crawl.py
@@ -0,0 +1,102 @@
+import time
+from typing import Optional
+
+from hyperbrowser.models.consts import POLLING_ATTEMPTS
+from ....models.crawl import (
+    CrawlJobResponse,
+    GetCrawlJobParams,
+    StartCrawlJobParams,
+    StartCrawlJobResponse,
+)
+from ....exceptions import HyperbrowserError
+
+
+class CrawlManager:
+    def __init__(self, client):
+        self._client = client
+
+    def start(self, params: StartCrawlJobParams) -> StartCrawlJobResponse:
+        response = self._client.transport.post(
+            self._client._build_url("/crawl"),
+            data=params.model_dump(exclude_none=True, by_alias=True),
+        )
+        return StartCrawlJobResponse(**response.data)
+
+    def get(
+        self, job_id: str, params: GetCrawlJobParams = GetCrawlJobParams()
+    ) -> CrawlJobResponse:
+        response = self._client.transport.get(
+            self._client._build_url(f"/crawl/{job_id}"), params=params.__dict__
+        )
+        return CrawlJobResponse(**response.data)
+
+    def start_and_wait(
+        self, params: StartCrawlJobParams, return_all_pages: bool = True
+    ) -> CrawlJobResponse:
+        job_start_resp = self.start(params)
+        job_id = job_start_resp.job_id
+        if not job_id:
+            raise HyperbrowserError("Failed to start crawl job")
+
+        job_response: CrawlJobResponse
+        failures = 0
+        while True:
+            try:
+                job_response = self.get(
+                    job_id,
+                    params=GetCrawlJobParams(batch_size=1),
+                )
+                if (
+                    job_response.status == "completed"
+                    or job_response.status == "failed"
+                ):
+                    break
+            except Exception as e:
+                failures += 1
+                if failures >= POLLING_ATTEMPTS:
+                    raise HyperbrowserError(
+                        f"Failed to poll crawl job {job_id} after {POLLING_ATTEMPTS} attempts: {e}"
+                    )
+            time.sleep(2)
+
+        failures = 0
+        if not return_all_pages:
+            while True:
+                try:
+                    job_response = self.get(job_id)
+                    return job_response
+                except Exception as e:
+                    failures += 1
+                    if failures >= POLLING_ATTEMPTS:
+                        raise HyperbrowserError(
+                            f"Failed to get crawl job {job_id} after {POLLING_ATTEMPTS} attempts: {e}"
+                        )
+                time.sleep(0.5)
+
+        failures = 0
+        job_response.current_page_batch = 0
+        job_response.data = []
+        while job_response.current_page_batch < job_response.total_page_batches:
+            try:
+                tmp_job_response = self.get(
+                    job_id,
+                    GetCrawlJobParams(
+                        page=job_response.current_page_batch + 1, batch_size=100
+                    ),
+                )
+                if tmp_job_response.data:
+                    job_response.data.extend(tmp_job_response.data)
+                job_response.current_page_batch = tmp_job_response.current_page_batch
+                job_response.total_crawled_pages = tmp_job_response.total_crawled_pages
+                job_response.total_page_batches = tmp_job_response.total_page_batches
+                job_response.batch_size = tmp_job_response.batch_size
+                failures = 0
+            except Exception as e:
+                failures += 1
+                if failures >= POLLING_ATTEMPTS:
+                    raise HyperbrowserError(
+                        f"Failed to get crawl batch page {job_response.current_page_batch} for job {job_id} after {POLLING_ATTEMPTS} attempts: {e}"
+                    )
+            time.sleep(0.5)
+
+        return job_response
{hyperbrowser-0.23.0 → hyperbrowser-0.24.0}/hyperbrowser/client/managers/sync_manager/extract.py
RENAMED
@@ -1,5 +1,6 @@
 import time
 from hyperbrowser.exceptions import HyperbrowserError
+from hyperbrowser.models.consts import POLLING_ATTEMPTS
 from hyperbrowser.models.extract import (
     ExtractJobResponse,
     StartExtractJobParams,
@@ -32,10 +33,23 @@ class ExtractManager:
 
     def start_and_wait(self, params: StartExtractJobParams) -> ExtractJobResponse:
         job_start_resp = self.start(params)
-        if not job_start_resp.job_id:
+        job_id = job_start_resp.job_id
+        if not job_id:
             raise HyperbrowserError("Failed to start extract job")
+
+        failures = 0
         while True:
-            job_response = self.get(job_start_resp.job_id)
-            if job_response.status == "completed" or job_response.status == "failed":
-                return job_response
+            try:
+                job_response = self.get(job_start_resp.job_id)
+                if (
+                    job_response.status == "completed"
+                    or job_response.status == "failed"
+                ):
+                    return job_response
+            except Exception as e:
+                failures += 1
+                if failures >= POLLING_ATTEMPTS:
+                    raise HyperbrowserError(
+                        f"Failed to poll extract job {job_id} after {POLLING_ATTEMPTS} attempts: {e}"
+                    )
             time.sleep(2)

hyperbrowser-0.24.0/hyperbrowser/client/managers/sync_manager/scrape.py
@@ -0,0 +1,148 @@
+import time
+from typing import Optional
+
+from hyperbrowser.models.consts import POLLING_ATTEMPTS
+from ....models.scrape import (
+    BatchScrapeJobResponse,
+    GetBatchScrapeJobParams,
+    ScrapeJobResponse,
+    StartBatchScrapeJobParams,
+    StartBatchScrapeJobResponse,
+    StartScrapeJobParams,
+    StartScrapeJobResponse,
+)
+from ....exceptions import HyperbrowserError
+
+
+class BatchScrapeManager:
+    def __init__(self, client):
+        self._client = client
+
+    def start(self, params: StartBatchScrapeJobParams) -> StartBatchScrapeJobResponse:
+        response = self._client.transport.post(
+            self._client._build_url("/scrape/batch"),
+            data=params.model_dump(exclude_none=True, by_alias=True),
+        )
+        return StartBatchScrapeJobResponse(**response.data)
+
+    def get(
+        self, job_id: str, params: GetBatchScrapeJobParams = GetBatchScrapeJobParams()
+    ) -> BatchScrapeJobResponse:
+        response = self._client.transport.get(
+            self._client._build_url(f"/scrape/batch/{job_id}"), params=params.__dict__
+        )
+        return BatchScrapeJobResponse(**response.data)
+
+    def start_and_wait(
+        self, params: StartBatchScrapeJobParams, return_all_pages: bool = True
+    ) -> BatchScrapeJobResponse:
+        job_start_resp = self.start(params)
+        job_id = job_start_resp.job_id
+        if not job_id:
+            raise HyperbrowserError("Failed to start batch scrape job")
+
+        job_response: BatchScrapeJobResponse
+        failures = 0
+        while True:
+            try:
+                job_response = self.get(
+                    job_id, params=GetBatchScrapeJobParams(batch_size=1)
+                )
+                if (
+                    job_response.status == "completed"
+                    or job_response.status == "failed"
+                ):
+                    break
+                failures = 0
+            except Exception as e:
+                failures += 1
+                if failures >= POLLING_ATTEMPTS:
+                    raise HyperbrowserError(
+                        f"Failed to poll batch scrape job {job_id} after {POLLING_ATTEMPTS} attempts: {e}"
+                    )
+            time.sleep(2)
+
+        failures = 0
+        if not return_all_pages:
+            while True:
+                try:
+                    job_response = self.get(job_id)
+                    return job_response
+                except Exception as e:
+                    failures += 1
+                    if failures >= POLLING_ATTEMPTS:
+                        raise HyperbrowserError(
+                            f"Failed to get batch scrape job {job_id} after {POLLING_ATTEMPTS} attempts: {e}"
+                        )
+                time.sleep(0.5)
+
+        failures = 0
+        job_response.current_page_batch = 0
+        job_response.data = []
+        while job_response.current_page_batch < job_response.total_page_batches:
+            try:
+                tmp_job_response = self.get(
+                    job_start_resp.job_id,
+                    GetBatchScrapeJobParams(
+                        page=job_response.current_page_batch + 1, batch_size=100
+                    ),
+                )
+                if tmp_job_response.data:
+                    job_response.data.extend(tmp_job_response.data)
+                job_response.current_page_batch = tmp_job_response.current_page_batch
+                job_response.total_scraped_pages = tmp_job_response.total_scraped_pages
+                job_response.total_page_batches = tmp_job_response.total_page_batches
+                job_response.batch_size = tmp_job_response.batch_size
+                failures = 0
+            except Exception as e:
+                failures += 1
+                if failures >= POLLING_ATTEMPTS:
+                    raise HyperbrowserError(
+                        f"Failed to get batch page {job_response.current_page_batch} for job {job_id} after {POLLING_ATTEMPTS} attempts: {e}"
+                    )
+            time.sleep(0.5)
+
+        return job_response
+
+
+class ScrapeManager:
+    def __init__(self, client):
+        self._client = client
+        self.batch = BatchScrapeManager(client)
+
+    def start(self, params: StartScrapeJobParams) -> StartScrapeJobResponse:
+        response = self._client.transport.post(
+            self._client._build_url("/scrape"),
+            data=params.model_dump(exclude_none=True, by_alias=True),
+        )
+        return StartScrapeJobResponse(**response.data)
+
+    def get(self, job_id: str) -> ScrapeJobResponse:
+        response = self._client.transport.get(
+            self._client._build_url(f"/scrape/{job_id}")
+        )
+        return ScrapeJobResponse(**response.data)
+
+    def start_and_wait(self, params: StartScrapeJobParams) -> ScrapeJobResponse:
+        job_start_resp = self.start(params)
+        job_id = job_start_resp.job_id
+        if not job_id:
+            raise HyperbrowserError("Failed to start scrape job")
+
+        failures = 0
+        while True:
+            try:
+                job_response = self.get(job_id)
+                if (
+                    job_response.status == "completed"
+                    or job_response.status == "failed"
+                ):
+                    return job_response
+                failures = 0
+            except Exception as e:
+                failures += 1
+                if failures >= POLLING_ATTEMPTS:
+                    raise HyperbrowserError(
+                        f"Failed to poll scrape job {job_id} after {POLLING_ATTEMPTS} attempts: {e}"
+                    )
+            time.sleep(2)

{hyperbrowser-0.23.0 → hyperbrowser-0.24.0}/hyperbrowser/models/consts.py
RENAMED
@@ -2,6 +2,8 @@ from typing import Literal
 
 ScrapeFormat = Literal["markdown", "html", "links", "screenshot"]
 ScrapeWaitUntil = Literal["load", "domcontentloaded", "networkidle"]
+ScrapePageStatus = Literal["completed", "failed", "pending", "running"]
+POLLING_ATTEMPTS = 5
 
 Country = Literal[
     "AD",

{hyperbrowser-0.23.0 → hyperbrowser-0.24.0}/hyperbrowser/models/crawl.py
RENAMED
@@ -18,7 +18,7 @@ class StartCrawlJobParams(BaseModel):
     )
 
     url: str
-    max_pages: int = Field(default=
+    max_pages: Optional[int] = Field(default=None, ge=1, serialization_alias="maxPages")
     follow_links: bool = Field(default=True, serialization_alias="followLinks")
     ignore_sitemap: bool = Field(default=False, serialization_alias="ignoreSitemap")
     exclude_patterns: List[str] = Field(
@@ -69,7 +69,7 @@ class GetCrawlJobParams(BaseModel):
 
     page: Optional[int] = Field(default=None, serialization_alias="page")
     batch_size: Optional[int] = Field(
-        default=
+        default=None, ge=1, serialization_alias="batchSize"
     )
 
 
{hyperbrowser-0.23.0 → hyperbrowser-0.24.0}/hyperbrowser/models/extract.py
RENAMED
@@ -17,13 +17,17 @@ class StartExtractJobParams(BaseModel):
     )
 
     urls: List[str]
-
+    system_prompt: Optional[str] = Field(
+        default=None, serialization_alias="systemPrompt"
+    )
+    prompt: Optional[str] = Field(default=None, serialization_alias="prompt")
     schema_: Optional[Any] = pydantic.Field(
         None, alias="schema", serialization_alias="schema"
     )
     session_options: Optional[CreateSessionParams] = Field(
         default=None, serialization_alias="sessionOptions"
     )
+    max_links: Optional[int] = Field(default=None, serialization_alias="maxLinks")
 
 
 class StartExtractJobResponse(BaseModel):
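
StartExtractJobParams now takes optional system_prompt, prompt and max_links fields alongside the existing schema_ and session_options. A small sketch of building the updated params; the prompt text and limit are illustrative values only:

from hyperbrowser.models.extract import StartExtractJobParams

# Only urls is required; the new fields serialize to systemPrompt, prompt and maxLinks.
params = StartExtractJobParams(
    urls=["https://example.com/pricing"],
    system_prompt="You extract structured pricing data.",
    prompt="Return the name and monthly price of each plan.",
    max_links=10,
)
print(params.model_dump(exclude_none=True, by_alias=True))
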
{hyperbrowser-0.23.0 → hyperbrowser-0.24.0}/hyperbrowser/models/scrape.py
RENAMED
@@ -1,7 +1,7 @@
 from typing import List, Literal, Optional, Union
 from pydantic import BaseModel, ConfigDict, Field
 
-from hyperbrowser.models.consts import ScrapeFormat, ScrapeWaitUntil
+from hyperbrowser.models.consts import ScrapeFormat, ScrapePageStatus, ScrapeWaitUntil
 from hyperbrowser.models.session import CreateSessionParams
 
 ScrapeJobStatus = Literal["pending", "running", "completed", "failed"]
@@ -84,3 +84,78 @@ class ScrapeJobResponse(BaseModel):
     status: ScrapeJobStatus
     error: Optional[str] = None
     data: Optional[ScrapeJobData] = None
+
+
+class StartBatchScrapeJobParams(BaseModel):
+    """
+    Parameters for creating a new batch scrape job.
+    """
+
+    urls: List[str]
+    session_options: Optional[CreateSessionParams] = Field(
+        default=None, serialization_alias="sessionOptions"
+    )
+    scrape_options: Optional[ScrapeOptions] = Field(
+        default=None, serialization_alias="scrapeOptions"
+    )
+
+
+class ScrapedPage(BaseModel):
+    """
+    A scraped page.
+    """
+
+    model_config = ConfigDict(
+        populate_by_alias=True,
+    )
+
+    url: str
+    status: ScrapePageStatus
+    error: Optional[str] = None
+    metadata: Optional[dict[str, Union[str, list[str]]]] = None
+    html: Optional[str] = None
+    markdown: Optional[str] = None
+    links: Optional[List[str]] = None
+    screenshot: Optional[str] = None
+
+
+class GetBatchScrapeJobParams(BaseModel):
+    """
+    Parameters for getting a batch scrape job.
+    """
+
+    page: Optional[int] = Field(default=None, serialization_alias="page")
+    batch_size: Optional[int] = Field(
+        default=None, ge=1, serialization_alias="batchSize"
+    )
+
+
+class StartBatchScrapeJobResponse(BaseModel):
+    """
+    Response from starting a batch scrape job.
+    """
+
+    model_config = ConfigDict(
+        populate_by_alias=True,
+    )
+
+    job_id: str = Field(alias="jobId")
+
+
+class BatchScrapeJobResponse(BaseModel):
+    """
+    Response from getting a batch scrape job.
+    """
+
+    model_config = ConfigDict(
+        populate_by_alias=True,
+    )
+
+    job_id: str = Field(alias="jobId")
+    status: ScrapeJobStatus
+    error: Optional[str] = None
+    data: Optional[List[ScrapedPage]] = Field(alias="data")
+    total_scraped_pages: int = Field(alias="totalScrapedPages")
+    total_page_batches: int = Field(alias="totalPageBatches")
+    current_page_batch: int = Field(alias="currentPageBatch")
+    batch_size: int = Field(alias="batchSize")
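
The batch scrape models mirror the paginated crawl response: StartBatchScrapeJobParams is the body posted by BatchScrapeManager.start, and BatchScrapeJobResponse carries one batch of ScrapedPage entries plus the pagination counters. A sketch of round-tripping them; the payload below is fabricated for illustration, not a real API response:

from hyperbrowser.models.scrape import (
    BatchScrapeJobResponse,
    StartBatchScrapeJobParams,
)

# Request model: camelCase aliases are used on the wire.
params = StartBatchScrapeJobParams(urls=["https://example.com"])
print(params.model_dump(exclude_none=True, by_alias=True))

# Response model parsed from a made-up payload shaped like the fields above.
payload = {
    "jobId": "job_123",
    "status": "completed",
    "data": [{"url": "https://example.com", "status": "completed", "markdown": "# Example"}],
    "totalScrapedPages": 1,
    "totalPageBatches": 1,
    "currentPageBatch": 1,
    "batchSize": 100,
}
job = BatchScrapeJobResponse(**payload)
print(job.total_scraped_pages, job.data[0].url if job.data else None)
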

hyperbrowser-0.23.0/hyperbrowser/client/managers/async_manager/crawl.py
@@ -1,60 +0,0 @@
-import asyncio
-from typing import Optional
-from ....models.crawl import (
-    CrawlJobResponse,
-    GetCrawlJobParams,
-    StartCrawlJobParams,
-    StartCrawlJobResponse,
-)
-from ....exceptions import HyperbrowserError
-
-
-class CrawlManager:
-    def __init__(self, client):
-        self._client = client
-
-    async def start(self, params: StartCrawlJobParams) -> StartCrawlJobResponse:
-        response = await self._client.transport.post(
-            self._client._build_url("/crawl"),
-            data=params.model_dump(exclude_none=True, by_alias=True),
-        )
-        return StartCrawlJobResponse(**response.data)
-
-    async def get(
-        self, job_id: str, params: GetCrawlJobParams = GetCrawlJobParams()
-    ) -> CrawlJobResponse:
-        response = await self._client.transport.get(
-            self._client._build_url(f"/crawl/{job_id}"), params=params.__dict__
-        )
-        return CrawlJobResponse(**response.data)
-
-    async def start_and_wait(
-        self, params: StartCrawlJobParams, return_all_pages: bool = True
-    ) -> CrawlJobResponse:
-        job_start_resp = await self.start(params)
-        if not job_start_resp.job_id:
-            raise HyperbrowserError("Failed to start crawl job")
-
-        job_response: CrawlJobResponse
-        while True:
-            job_response = await self.get(job_start_resp.job_id)
-            if job_response.status == "completed" or job_response.status == "failed":
-                break
-            await asyncio.sleep(2)
-
-        if not return_all_pages:
-            return job_response
-
-        while job_response.current_page_batch < job_response.total_page_batches:
-            tmp_job_response = await self.get(
-                job_start_resp.job_id,
-                GetCrawlJobParams(page=job_response.current_page_batch + 1),
-            )
-            if tmp_job_response.data:
-                job_response.data.extend(tmp_job_response.data)
-            job_response.current_page_batch = tmp_job_response.current_page_batch
-            job_response.total_crawled_pages = tmp_job_response.total_crawled_pages
-            job_response.total_page_batches = tmp_job_response.total_page_batches
-            job_response.batch_size = tmp_job_response.batch_size
-            await asyncio.sleep(0.5)
-        return job_response

hyperbrowser-0.23.0/hyperbrowser/client/managers/async_manager/scrape.py
@@ -1,36 +0,0 @@
-import asyncio
-from typing import Optional
-from ....models.scrape import (
-    ScrapeJobResponse,
-    StartScrapeJobParams,
-    StartScrapeJobResponse,
-)
-from ....exceptions import HyperbrowserError
-
-
-class ScrapeManager:
-    def __init__(self, client):
-        self._client = client
-
-    async def start(self, params: StartScrapeJobParams) -> StartScrapeJobResponse:
-        response = await self._client.transport.post(
-            self._client._build_url("/scrape"),
-            data=params.model_dump(exclude_none=True, by_alias=True),
-        )
-        return StartScrapeJobResponse(**response.data)
-
-    async def get(self, job_id: str) -> ScrapeJobResponse:
-        response = await self._client.transport.get(
-            self._client._build_url(f"/scrape/{job_id}")
-        )
-        return ScrapeJobResponse(**response.data)
-
-    async def start_and_wait(self, params: StartScrapeJobParams) -> ScrapeJobResponse:
-        job_start_resp = await self.start(params)
-        if not job_start_resp.job_id:
-            raise HyperbrowserError("Failed to start scrape job")
-        while True:
-            job_response = await self.get(job_start_resp.job_id)
-            if job_response.status == "completed" or job_response.status == "failed":
-                return job_response
-            await asyncio.sleep(2)

hyperbrowser-0.23.0/hyperbrowser/client/managers/sync_manager/crawl.py
@@ -1,60 +0,0 @@
-import time
-from typing import Optional
-from ....models.crawl import (
-    CrawlJobResponse,
-    GetCrawlJobParams,
-    StartCrawlJobParams,
-    StartCrawlJobResponse,
-)
-from ....exceptions import HyperbrowserError
-
-
-class CrawlManager:
-    def __init__(self, client):
-        self._client = client
-
-    def start(self, params: StartCrawlJobParams) -> StartCrawlJobResponse:
-        response = self._client.transport.post(
-            self._client._build_url("/crawl"),
-            data=params.model_dump(exclude_none=True, by_alias=True),
-        )
-        return StartCrawlJobResponse(**response.data)
-
-    def get(
-        self, job_id: str, params: GetCrawlJobParams = GetCrawlJobParams()
-    ) -> CrawlJobResponse:
-        response = self._client.transport.get(
-            self._client._build_url(f"/crawl/{job_id}"), params=params.__dict__
-        )
-        return CrawlJobResponse(**response.data)
-
-    def start_and_wait(
-        self, params: StartCrawlJobParams, return_all_pages: bool = True
-    ) -> CrawlJobResponse:
-        job_start_resp = self.start(params)
-        if not job_start_resp.job_id:
-            raise HyperbrowserError("Failed to start crawl job")
-
-        job_response: CrawlJobResponse
-        while True:
-            job_response = self.get(job_start_resp.job_id)
-            if job_response.status == "completed" or job_response.status == "failed":
-                break
-            time.sleep(2)
-
-        if not return_all_pages:
-            return job_response
-
-        while job_response.current_page_batch < job_response.total_page_batches:
-            tmp_job_response = self.get(
-                job_start_resp.job_id,
-                GetCrawlJobParams(page=job_response.current_page_batch + 1),
-            )
-            if tmp_job_response.data:
-                job_response.data.extend(tmp_job_response.data)
-            job_response.current_page_batch = tmp_job_response.current_page_batch
-            job_response.total_crawled_pages = tmp_job_response.total_crawled_pages
-            job_response.total_page_batches = tmp_job_response.total_page_batches
-            job_response.batch_size = tmp_job_response.batch_size
-            time.sleep(0.5)
-        return job_response

hyperbrowser-0.23.0/hyperbrowser/client/managers/sync_manager/scrape.py
@@ -1,36 +0,0 @@
-import time
-from typing import Optional
-from ....models.scrape import (
-    ScrapeJobResponse,
-    StartScrapeJobParams,
-    StartScrapeJobResponse,
-)
-from ....exceptions import HyperbrowserError
-
-
-class ScrapeManager:
-    def __init__(self, client):
-        self._client = client
-
-    def start(self, params: StartScrapeJobParams) -> StartScrapeJobResponse:
-        response = self._client.transport.post(
-            self._client._build_url("/scrape"),
-            data=params.model_dump(exclude_none=True, by_alias=True),
-        )
-        return StartScrapeJobResponse(**response.data)
-
-    def get(self, job_id: str) -> ScrapeJobResponse:
-        response = self._client.transport.get(
-            self._client._build_url(f"/scrape/{job_id}")
-        )
-        return ScrapeJobResponse(**response.data)
-
-    def start_and_wait(self, params: StartScrapeJobParams) -> ScrapeJobResponse:
-        job_start_resp = self.start(params)
-        if not job_start_resp.job_id:
-            raise HyperbrowserError("Failed to start scrape job")
-        while True:
-            job_response = self.get(job_start_resp.job_id)
-            if job_response.status == "completed" or job_response.status == "failed":
-                return job_response
-            time.sleep(2)