fraudcrawler 0.4.6__py3-none-any.whl → 0.5.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -2,17 +2,17 @@ import logging
2
2
  from typing import List
3
3
  from base64 import b64decode
4
4
 
5
- import aiohttp
5
+ import httpx
6
6
  from tenacity import RetryCallState
7
7
 
8
8
  from fraudcrawler.settings import ZYTE_DEFALUT_PROBABILITY_THRESHOLD
9
- from fraudcrawler.base.base import AsyncClient
9
+ from fraudcrawler.base.base import DomainUtils
10
10
  from fraudcrawler.base.retry import get_async_retry
11
11
 
12
12
  logger = logging.getLogger(__name__)
13
13
 
14
14
 
15
- class ZyteApi(AsyncClient):
15
+ class ZyteAPI(DomainUtils):
16
16
  """A client to interact with the Zyte API for fetching product details."""
17
17
 
18
18
  _endpoint = "https://api.zyte.com/v1/extract"
@@ -30,14 +30,17 @@ class ZyteApi(AsyncClient):
30
30
 
31
31
  def __init__(
32
32
  self,
33
+ http_client: httpx.AsyncClient,
33
34
  api_key: str,
34
35
  ):
35
36
  """Initializes the ZyteApiClient with the given API key and retry configurations.
36
37
 
37
38
  Args:
39
+ http_client: An httpx.AsyncClient to use for the async requests.
38
40
  api_key: The API key for Zyte API.
39
41
  """
40
- self._aiohttp_basic_auth = aiohttp.BasicAuth(api_key)
42
+ self._http_client = http_client
43
+ self._api_key = api_key
41
44
 
42
45
  def _log_before(self, url: str, retry_state: RetryCallState | None) -> None:
43
46
  """Context aware logging before the request is made."""
@@ -58,7 +61,7 @@ class ZyteApi(AsyncClient):
58
61
  else:
59
62
  logger.debug(f"retry_state is {retry_state}; not logging before_sleep.")
60
63
 
61
- async def get_details(self, url: str) -> dict:
64
+ async def details(self, url: str) -> dict:
62
65
  """Fetches product details for a single URL.
63
66
 
64
67
  Args:
@@ -97,16 +100,20 @@ class ZyteApi(AsyncClient):
97
100
  )
98
101
  async for attempt in retry:
99
102
  with attempt:
100
- product = await self.post(
103
+ response = await self._http_client.post(
101
104
  url=self._endpoint,
102
- data={"url": url, **self._config},
103
- auth=self._aiohttp_basic_auth,
105
+ json={"url": url, **self._config},
106
+ auth=(self._api_key, ""), # API key as username, empty password
104
107
  )
105
- return product
108
+ response.raise_for_status()
109
+
110
+ details = response.json()
111
+ return details
106
112
 
107
113
  @staticmethod
108
114
  def keep_product(
109
- details: dict, threshold: float = ZYTE_DEFALUT_PROBABILITY_THRESHOLD
115
+ details: dict,
116
+ threshold: float = ZYTE_DEFALUT_PROBABILITY_THRESHOLD,
110
117
  ) -> bool:
111
118
  """Determines whether to keep the product based on the probability threshold.
112
119
 
@@ -136,6 +143,19 @@ class ZyteApi(AsyncClient):
136
143
  """
137
144
  return details.get("product", {}).get("name")
138
145
 
146
+ @staticmethod
147
+ def extract_url_resolved(details: dict) -> str | None:
148
+ """Extracts the resolved URL from the product data - this is automatically resolved by Zyte.
149
+
150
+ The input argument is a dictionary of the following structure:
151
+ {
152
+ "product": {
153
+ "url": str,
154
+ }
155
+ }
156
+ """
157
+ return details.get("product", {}).get("url")
158
+
139
159
  @staticmethod
140
160
  def extract_product_price(details: dict) -> str | None:
141
161
  """Extracts the product price from the product data.
@@ -198,7 +218,9 @@ class ZyteApi(AsyncClient):
198
218
  }
199
219
  }
200
220
  """
201
- return float(details.get("product", {}).get("metadata", {}).get("probability"))
221
+ return float(
222
+ details.get("product", {}).get("metadata", {}).get("probability", 0.0)
223
+ )
202
224
 
203
225
  @staticmethod
204
226
  def extract_html(details: dict) -> str | None:
@@ -209,7 +231,6 @@ class ZyteApi(AsyncClient):
209
231
  "httpResponseBody": base64
210
232
  }
211
233
  """
212
-
213
234
  # Get the Base64-encoded content
214
235
  encoded = details.get("httpResponseBody")
215
236
 
@@ -217,6 +238,7 @@ class ZyteApi(AsyncClient):
217
238
  if isinstance(encoded, str):
218
239
  decoded_bytes = b64decode(encoded)
219
240
 
220
- # Convert bytes to string (assuming UTF-8 encoding)
221
- decoded_string = decoded_bytes.decode("utf-8")
222
- return decoded_string
241
+ # Convert bytes to string (assuming UTF-8 encoding)
242
+ decoded_string = decoded_bytes.decode("utf-8")
243
+ return decoded_string
244
+ return None
fraudcrawler/settings.py CHANGED
@@ -17,7 +17,7 @@ RETRY_SKIP_IF_CODE = [400, 401, 403] # Skip retrying on these HTTP status codes
17
17
  # Serp settings
18
18
  GOOGLE_LOCATIONS_FILENAME = ROOT_DIR / "fraudcrawler" / "base" / "google-locations.json"
19
19
  GOOGLE_LANGUAGES_FILENAME = ROOT_DIR / "fraudcrawler" / "base" / "google-languages.json"
20
- SERP_DEFAULT_COUNTRY_CODES: List[str] = [
20
+ SEARCH_DEFAULT_COUNTRY_CODES: List[str] = [
21
21
  # ".com",
22
22
  ]
23
23
 
@@ -75,7 +75,18 @@ PROCESSOR_EMPTY_TOKEN_COUNT = -1
75
75
  PROCESSOR_USER_PROMPT_TEMPLATE = "Product Details:\n{product_details}\n\nRelevance:"
76
76
  PROCESSOR_PRODUCT_DETAILS_TEMPLATE = "{field_name}:\n{field_value}"
77
77
 
78
- # Async settings
78
+ # Async workers settings
79
79
  DEFAULT_N_SERP_WKRS = 10
80
80
  DEFAULT_N_ZYTE_WKRS = 10
81
81
  DEFAULT_N_PROC_WKRS = 10
82
+
83
+ # HTTPX client settings
84
+ DEFAULT_HTTPX_TIMEOUT = {
85
+ "timeout": 600,
86
+ "connect": 5.0,
87
+ }
88
+ DEFAULT_HTTPX_LIMITS = {
89
+ "max_connections": 1000,
90
+ "max_keepalive_connections": 100,
91
+ }
92
+ DEFAULT_HTTPX_REDIRECTS = True
@@ -1,6 +1,6 @@
1
- Metadata-Version: 2.3
1
+ Metadata-Version: 2.1
2
2
  Name: fraudcrawler
3
- Version: 0.4.6
3
+ Version: 0.5.0
4
4
  Summary: Intelligent Market Monitoring
5
5
  Home-page: https://github.com/open-veanu/fraudcrawler
6
6
  License: MIT
@@ -11,9 +11,8 @@ Classifier: License :: OSI Approved :: MIT License
11
11
  Classifier: Programming Language :: Python :: 3
12
12
  Classifier: Programming Language :: Python :: 3.11
13
13
  Classifier: Programming Language :: Python :: 3.12
14
- Classifier: Programming Language :: Python :: 3.13
15
- Requires-Dist: aiohttp (>=3.11.14,<4.0.0)
16
14
  Requires-Dist: beautifulsoup4 (>=4.13.4,<5.0.0)
15
+ Requires-Dist: httpx (>=0.28.1,<0.29.0)
17
16
  Requires-Dist: openai (>=1.68.2,<2.0.0)
18
17
  Requires-Dist: pandas (>=2.2.3,<3.0.0)
19
18
  Requires-Dist: pydantic-settings (>=2.8.1,<3.0.0)
@@ -159,6 +158,10 @@ client.print_available_results()
159
158
  see `CONTRIBUTING.md`
160
159
 
161
160
  ### Async Setup
161
+ The `Orchestrator` class in `src/base/orchestrator.py` is designed to coordinate multiple services that may have interdependencies, allowing them to run in a semi-iterative manner. This means, for example, that product A can be at stage III of the pipeline while product B is still at stage I.
162
+
163
+ This behavior is enabled through an asynchronous pipeline setup. The three main steps, `SerpAPI`, `ZyteAPI`, and `Processor`, all utilize `httpx.AsyncClient`. It is both possible and highly recommended to manage a single AsyncClient instance per application for efficiency. We provide a `HttpxAsyncClient` class that you can pass to these components. For more details, see the [httpx documentation](https://www.python-httpx.org/api/#asyncclient).
164
+
162
165
  The following image provides a schematic representation of the package's async setup.
163
166
  ![Async Setup](https://github.com/open-veanu/fraudcrawler/raw/master/docs/assets/images/Fraudcrawler_Async_Setup.svg)
164
167
 
@@ -0,0 +1,22 @@
1
+ fraudcrawler/__init__.py,sha256=Kr19jWhtbC1shVoB9fHvBSeoG1IyQB9re1kCZ4YIAi0,842
2
+ fraudcrawler/base/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
3
+ fraudcrawler/base/base.py,sha256=suQMnvLIsZO_R0eHZKDWS4u9qnd1ryzPhjGlwcaMD5A,7295
4
+ fraudcrawler/base/client.py,sha256=yhkNrhL2SuJXTknLf-8P81fv01FnFMahREZgem-Z-f0,5832
5
+ fraudcrawler/base/google-languages.json,sha256=z0VtjMCsCcZq11OkCIb9jMDD1p9Ty4lhV7bq4ddYvec,10748
6
+ fraudcrawler/base/google-locations.json,sha256=UtNu0iSStllvFRTQXMobWKmZR1hKmtgtHftLNgaJTT0,9204345
7
+ fraudcrawler/base/orchestrator.py,sha256=AKEETrYwKbMy_6YgTdgc6L-VA1iHYOtj3wIqEN3ngO4,26990
8
+ fraudcrawler/base/retry.py,sha256=9VyVrbYR_0YnfxFhUrvcM3aWCYR6oR4iZE4A3zzVZUs,1015
9
+ fraudcrawler/launch_demo_pipeline.py,sha256=j5lu8lLl8QrkVU1MJH25uKtyYk_6lBSeoouCo30aRXg,4634
10
+ fraudcrawler/processing/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
11
+ fraudcrawler/processing/processor.py,sha256=Qq8QcTlqfnzFi1t-1KkncXxaIszUO7pGK3LXTdHkDnM,7638
12
+ fraudcrawler/scraping/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
13
+ fraudcrawler/scraping/enrich.py,sha256=M4ErLF7q-5EKfEd-dIDS73mZc-aPFp5pJbgdRKCO3f8,13112
14
+ fraudcrawler/scraping/search.py,sha256=nHMYaSkq9o6Hr4yUDEPguj8IHVcOpws3_XWiAbCVgLg,24062
15
+ fraudcrawler/scraping/url.py,sha256=5Z3hPW73E-TLhM-Zha8OTcUOumc_rcx64R0fT9z2Hi8,1748
16
+ fraudcrawler/scraping/zyte.py,sha256=GqvVWA1AWVoClAwd-hQ9iynsT0dOb7R0ntaLK5XVivM,8340
17
+ fraudcrawler/settings.py,sha256=uwXMOQpuwyWkuMU0asYGtBlL_qJj8F-Xkg4dUaCmDxE,3670
18
+ fraudcrawler-0.5.0.dist-info/LICENSE,sha256=B-3FuHfe3S0fWAlKlceskPcRhzXq81g-rJ-ddUYb4O8,1062
19
+ fraudcrawler-0.5.0.dist-info/METADATA,sha256=H9aq_euzQMD8Ag3gbo3GIrfC4eVl-gGahD_DieQ1oow,6642
20
+ fraudcrawler-0.5.0.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
21
+ fraudcrawler-0.5.0.dist-info/entry_points.txt,sha256=1Befm7cM6945y2AA1z9V4gZV63mtSWcAs7ypvgux_Xg,79
22
+ fraudcrawler-0.5.0.dist-info/RECORD,,
@@ -1,4 +1,4 @@
1
1
  Wheel-Version: 1.0
2
- Generator: poetry-core 2.0.0
2
+ Generator: poetry-core 1.9.0
3
3
  Root-Is-Purelib: true
4
4
  Tag: py3-none-any