fraudcrawler 0.4.7__py3-none-any.whl → 0.5.1__py3-none-any.whl
This diff shows the content of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of fraudcrawler might be problematic.
- fraudcrawler/__init__.py +7 -5
- fraudcrawler/base/base.py +64 -32
- fraudcrawler/base/client.py +27 -11
- fraudcrawler/base/orchestrator.py +103 -25
- fraudcrawler/base/retry.py +5 -2
- fraudcrawler/launch_demo_pipeline.py +9 -9
- fraudcrawler/processing/processor.py +9 -5
- fraudcrawler/scraping/enrich.py +38 -21
- fraudcrawler/scraping/search.py +664 -0
- fraudcrawler/scraping/zyte.py +37 -15
- fraudcrawler/settings.py +13 -2
- {fraudcrawler-0.4.7.dist-info → fraudcrawler-0.5.1.dist-info}/METADATA +6 -2
- fraudcrawler-0.5.1.dist-info/RECORD +22 -0
- fraudcrawler/scraping/serp.py +0 -515
- fraudcrawler-0.4.7.dist-info/RECORD +0 -22
- {fraudcrawler-0.4.7.dist-info → fraudcrawler-0.5.1.dist-info}/LICENSE +0 -0
- {fraudcrawler-0.4.7.dist-info → fraudcrawler-0.5.1.dist-info}/WHEEL +0 -0
- {fraudcrawler-0.4.7.dist-info → fraudcrawler-0.5.1.dist-info}/entry_points.txt +0 -0
fraudcrawler/scraping/zyte.py
CHANGED
@@ -2,17 +2,17 @@ import logging
 from typing import List
 from base64 import b64decode
 
-import aiohttp
+import httpx
 from tenacity import RetryCallState
 
 from fraudcrawler.settings import ZYTE_DEFALUT_PROBABILITY_THRESHOLD
-from fraudcrawler.base.base import AsyncClient
+from fraudcrawler.base.base import DomainUtils
 from fraudcrawler.base.retry import get_async_retry
 
 logger = logging.getLogger(__name__)
 
 
-class ZyteApi(AsyncClient):
+class ZyteAPI(DomainUtils):
     """A client to interact with the Zyte API for fetching product details."""
 
     _endpoint = "https://api.zyte.com/v1/extract"
@@ -30,14 +30,17 @@ class ZyteApi(AsyncClient):
 
     def __init__(
         self,
+        http_client: httpx.AsyncClient,
         api_key: str,
     ):
         """Initializes the ZyteApiClient with the given API key and retry configurations.
 
         Args:
+            http_client: An httpx.AsyncClient to use for the async requests.
            api_key: The API key for Zyte API.
        """
-        self.
+        self._http_client = http_client
+        self._api_key = api_key
 
     def _log_before(self, url: str, retry_state: RetryCallState | None) -> None:
         """Context aware logging before the request is made."""
@@ -58,7 +61,7 @@ class ZyteApi(AsyncClient):
         else:
             logger.debug(f"retry_state is {retry_state}; not logging before_sleep.")
 
-    async def
+    async def details(self, url: str) -> dict:
         """Fetches product details for a single URL.
 
         Args:
@@ -97,16 +100,20 @@ class ZyteApi(AsyncClient):
             )
             async for attempt in retry:
                 with attempt:
-
+                    response = await self._http_client.post(
                         url=self._endpoint,
-
-                        auth=self.
+                        json={"url": url, **self._config},
+                        auth=(self._api_key, ""),  # API key as username, empty password
                     )
-
+                    response.raise_for_status()
+
+                    details = response.json()
+                    return details
 
     @staticmethod
     def keep_product(
-        details: dict,
+        details: dict,
+        threshold: float = ZYTE_DEFALUT_PROBABILITY_THRESHOLD,
     ) -> bool:
         """Determines whether to keep the product based on the probability threshold.
 
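With these changes, the Zyte client no longer owns its HTTP session: callers inject a shared `httpx.AsyncClient`, and the API key is sent as HTTP Basic auth (key as username, empty password). A minimal usage sketch, assuming only the `__init__`, `details`, and `keep_product` signatures visible in this diff (the URL, key, and threshold values are placeholders):

```python
import asyncio

import httpx

from fraudcrawler.scraping.zyte import ZyteAPI


async def main() -> None:
    # One shared AsyncClient per application (see the Async Setup notes below).
    async with httpx.AsyncClient(timeout=600) as http_client:
        api = ZyteAPI(http_client=http_client, api_key="YOUR_ZYTE_API_KEY")
        details = await api.details("https://example.com/some-product")
        # keep_product now accepts an explicit threshold (defaulting to
        # ZYTE_DEFALUT_PROBABILITY_THRESHOLD from fraudcrawler.settings).
        if ZyteAPI.keep_product(details, threshold=0.2):
            print(details.get("product", {}).get("name"))


asyncio.run(main())
```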
@@ -136,6 +143,19 @@ class ZyteApi(AsyncClient):
         """
         return details.get("product", {}).get("name")
 
+    @staticmethod
+    def extract_url_resolved(details: dict) -> str | None:
+        """Extracts the resolved URL from the product data - this is automatically resolved by Zyte.
+
+        The input argument is a dictionary of the following structure:
+            {
+                "product": {
+                    "url": str,
+                }
+            }
+        """
+        return details.get("product", {}).get("url")
+
     @staticmethod
     def extract_product_price(details: dict) -> str | None:
         """Extracts the product price from the product data.
@@ -198,7 +218,9 @@ class ZyteApi(AsyncClient):
             }
         }
         """
-        return float(
+        return float(
+            details.get("product", {}).get("metadata", {}).get("probability", 0.0)
+        )
 
     @staticmethod
     def extract_html(details: dict) -> str | None:
@@ -209,7 +231,6 @@ class ZyteApi(AsyncClient):
             "httpResponseBody": base64
         }
        """
-
        # Get the Base64-encoded content
        encoded = details.get("httpResponseBody")
 
@@ -217,6 +238,7 @@ class ZyteApi(AsyncClient):
         if isinstance(encoded, str):
             decoded_bytes = b64decode(encoded)
 
-
-
-
+            # Convert bytes to string (assuming UTF-8 encoding)
+            decoded_string = decoded_bytes.decode("utf-8")
+            return decoded_string
+        return None
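The new `extract_url_resolved` helper and the repaired `extract_html` return path are plain dictionary accessors, so they can be exercised without any network call. A small sketch against a made-up payload (the field values are illustrative, not real Zyte output):

```python
from base64 import b64encode

from fraudcrawler.scraping.zyte import ZyteAPI

# Minimal Zyte-style response payload with fabricated values.
sample = {
    "product": {
        "url": "https://shop.example/product/123",
        "metadata": {"probability": 0.93},
    },
    "httpResponseBody": b64encode(b"<html><body>demo</body></html>").decode("ascii"),
}

print(ZyteAPI.extract_url_resolved(sample))  # https://shop.example/product/123
print(ZyteAPI.extract_html(sample))          # <html><body>demo</body></html>
```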
fraudcrawler/settings.py
CHANGED
@@ -17,7 +17,7 @@ RETRY_SKIP_IF_CODE = [400, 401, 403] # Skip retrying on these HTTP status codes
 # Serp settings
 GOOGLE_LOCATIONS_FILENAME = ROOT_DIR / "fraudcrawler" / "base" / "google-locations.json"
 GOOGLE_LANGUAGES_FILENAME = ROOT_DIR / "fraudcrawler" / "base" / "google-languages.json"
-
+SEARCH_DEFAULT_COUNTRY_CODES: List[str] = [
     # ".com",
 ]
 
@@ -75,7 +75,18 @@ PROCESSOR_EMPTY_TOKEN_COUNT = -1
 PROCESSOR_USER_PROMPT_TEMPLATE = "Product Details:\n{product_details}\n\nRelevance:"
 PROCESSOR_PRODUCT_DETAILS_TEMPLATE = "{field_name}:\n{field_value}"
 
-# Async settings
+# Async workers settings
 DEFAULT_N_SERP_WKRS = 10
 DEFAULT_N_ZYTE_WKRS = 10
 DEFAULT_N_PROC_WKRS = 10
+
+# HTTPX client settings
+DEFAULT_HTTPX_TIMEOUT = {
+    "timeout": 600,
+    "connect": 5.0,
+}
+DEFAULT_HTTPX_LIMITS = {
+    "max_connections": 1000,
+    "max_keepalive_connections": 100,
+}
+DEFAULT_HTTPX_REDIRECTS = True
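The new `DEFAULT_HTTPX_*` dictionaries mirror the keyword arguments of `httpx.Timeout` and `httpx.Limits`. The diff does not show where the package consumes them; a plausible sketch of how they might be wired into a shared client:

```python
import httpx

from fraudcrawler.settings import (
    DEFAULT_HTTPX_LIMITS,
    DEFAULT_HTTPX_REDIRECTS,
    DEFAULT_HTTPX_TIMEOUT,
)

# "timeout" sets the default for all phases; "connect" overrides just the
# connection phase. Limits cap the size of the shared connection pool.
client = httpx.AsyncClient(
    timeout=httpx.Timeout(**DEFAULT_HTTPX_TIMEOUT),
    limits=httpx.Limits(**DEFAULT_HTTPX_LIMITS),
    follow_redirects=DEFAULT_HTTPX_REDIRECTS,
)
```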
{fraudcrawler-0.4.7.dist-info → fraudcrawler-0.5.1.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: fraudcrawler
-Version: 0.4.7
+Version: 0.5.1
 Summary: Intelligent Market Monitoring
 Home-page: https://github.com/open-veanu/fraudcrawler
 License: MIT
@@ -11,8 +11,8 @@ Classifier: License :: OSI Approved :: MIT License
 Classifier: Programming Language :: Python :: 3
 Classifier: Programming Language :: Python :: 3.11
 Classifier: Programming Language :: Python :: 3.12
-Requires-Dist: aiohttp (>=3.11.14,<4.0.0)
 Requires-Dist: beautifulsoup4 (>=4.13.4,<5.0.0)
+Requires-Dist: httpx (>=0.28.1,<0.29.0)
 Requires-Dist: openai (>=1.68.2,<2.0.0)
 Requires-Dist: pandas (>=2.2.3,<3.0.0)
 Requires-Dist: pydantic-settings (>=2.8.1,<3.0.0)
@@ -158,6 +158,10 @@ client.print_available_results()
 see `CONTRIBUTING.md`
 
 ### Async Setup
+The `Orchestrator` class in `src/base/orchestrator.py` is designed to coordinate multiple services that may have interdependencies, allowing them to run in a semi-iterative manner. This means, for example, that product A can be at stage III of the pipeline while product B is still at stage I.
+
+This behavior is enabled through an asynchronous pipeline setup. The three main steps, `SerpAPI`, `ZyteAPI`, and `Processor`, all utilize `httpx.AsyncClient`. It is both possible and highly recommended to manage a single AsyncClient instance per application for efficiency. We provide a `HttpxAsyncClient` class that you can pass to these services. For more details, see the [httpx documentation](https://www.python-httpx.org/api/#asyncclient).
+
 The following image provides a schematic representation of the package's async setup.
 
 
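The semi-iterative behavior described in this README addition (product A at stage III while product B is still at stage I) is the classic producer/consumer pattern over `asyncio` queues with a single shared `httpx.AsyncClient`. The orchestrator's actual wiring is not part of this diff; the sketch below only illustrates the idea, with placeholder stage logic:

```python
import asyncio

import httpx


async def main() -> None:
    # One AsyncClient for the whole application, shared by every stage.
    async with httpx.AsyncClient(timeout=600, follow_redirects=True) as client:
        urls: asyncio.Queue[str | None] = asyncio.Queue()

        async def search_stage() -> None:
            # Stage I: emit each result downstream as soon as it is found,
            # so stage II can start on product A while B is still searched.
            for query in ["product-a", "product-b"]:
                await urls.put(f"https://example.com/{query}")  # placeholder
            await urls.put(None)  # sentinel: stage I is done

        async def detail_stage() -> None:
            # Stage II: consume URLs independently of stage I's progress.
            while (url := await urls.get()) is not None:
                response = await client.get(url)  # stand-in for a Zyte call
                print(url, response.status_code)

        await asyncio.gather(search_stage(), detail_stage())


asyncio.run(main())
```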
fraudcrawler-0.5.1.dist-info/RECORD
ADDED
@@ -0,0 +1,22 @@
+fraudcrawler/__init__.py,sha256=Kr19jWhtbC1shVoB9fHvBSeoG1IyQB9re1kCZ4YIAi0,842
+fraudcrawler/base/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+fraudcrawler/base/base.py,sha256=suQMnvLIsZO_R0eHZKDWS4u9qnd1ryzPhjGlwcaMD5A,7295
+fraudcrawler/base/client.py,sha256=yhkNrhL2SuJXTknLf-8P81fv01FnFMahREZgem-Z-f0,5832
+fraudcrawler/base/google-languages.json,sha256=z0VtjMCsCcZq11OkCIb9jMDD1p9Ty4lhV7bq4ddYvec,10748
+fraudcrawler/base/google-locations.json,sha256=UtNu0iSStllvFRTQXMobWKmZR1hKmtgtHftLNgaJTT0,9204345
+fraudcrawler/base/orchestrator.py,sha256=AKEETrYwKbMy_6YgTdgc6L-VA1iHYOtj3wIqEN3ngO4,26990
+fraudcrawler/base/retry.py,sha256=9VyVrbYR_0YnfxFhUrvcM3aWCYR6oR4iZE4A3zzVZUs,1015
+fraudcrawler/launch_demo_pipeline.py,sha256=j5lu8lLl8QrkVU1MJH25uKtyYk_6lBSeoouCo30aRXg,4634
+fraudcrawler/processing/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+fraudcrawler/processing/processor.py,sha256=Qq8QcTlqfnzFi1t-1KkncXxaIszUO7pGK3LXTdHkDnM,7638
+fraudcrawler/scraping/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+fraudcrawler/scraping/enrich.py,sha256=M4ErLF7q-5EKfEd-dIDS73mZc-aPFp5pJbgdRKCO3f8,13112
+fraudcrawler/scraping/search.py,sha256=nNkLgkF12AFS9A5vPs8yCb7irqxn6hiNWhS8Ku28oO0,24062
+fraudcrawler/scraping/url.py,sha256=5Z3hPW73E-TLhM-Zha8OTcUOumc_rcx64R0fT9z2Hi8,1748
+fraudcrawler/scraping/zyte.py,sha256=GqvVWA1AWVoClAwd-hQ9iynsT0dOb7R0ntaLK5XVivM,8340
+fraudcrawler/settings.py,sha256=uwXMOQpuwyWkuMU0asYGtBlL_qJj8F-Xkg4dUaCmDxE,3670
+fraudcrawler-0.5.1.dist-info/LICENSE,sha256=B-3FuHfe3S0fWAlKlceskPcRhzXq81g-rJ-ddUYb4O8,1062
+fraudcrawler-0.5.1.dist-info/METADATA,sha256=QsNxcNbL02EmhHZTpusO5t4uleOW15lEmWc76Kv3BHQ,6642
+fraudcrawler-0.5.1.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
+fraudcrawler-0.5.1.dist-info/entry_points.txt,sha256=1Befm7cM6945y2AA1z9V4gZV63mtSWcAs7ypvgux_Xg,79
+fraudcrawler-0.5.1.dist-info/RECORD,,