fraudcrawler 0.5.7__tar.gz → 0.5.8__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of fraudcrawler might be problematic; see the package registry page for more details.

Files changed (21)
  1. {fraudcrawler-0.5.7 → fraudcrawler-0.5.8}/PKG-INFO +1 -1
  2. {fraudcrawler-0.5.7 → fraudcrawler-0.5.8}/fraudcrawler/base/base.py +6 -5
  3. {fraudcrawler-0.5.7 → fraudcrawler-0.5.8}/fraudcrawler/base/orchestrator.py +6 -4
  4. {fraudcrawler-0.5.7 → fraudcrawler-0.5.8}/fraudcrawler/scraping/search.py +5 -3
  5. {fraudcrawler-0.5.7 → fraudcrawler-0.5.8}/pyproject.toml +1 -1
  6. {fraudcrawler-0.5.7 → fraudcrawler-0.5.8}/LICENSE +0 -0
  7. {fraudcrawler-0.5.7 → fraudcrawler-0.5.8}/README.md +0 -0
  8. {fraudcrawler-0.5.7 → fraudcrawler-0.5.8}/fraudcrawler/__init__.py +0 -0
  9. {fraudcrawler-0.5.7 → fraudcrawler-0.5.8}/fraudcrawler/base/__init__.py +0 -0
  10. {fraudcrawler-0.5.7 → fraudcrawler-0.5.8}/fraudcrawler/base/client.py +0 -0
  11. {fraudcrawler-0.5.7 → fraudcrawler-0.5.8}/fraudcrawler/base/google-languages.json +0 -0
  12. {fraudcrawler-0.5.7 → fraudcrawler-0.5.8}/fraudcrawler/base/google-locations.json +0 -0
  13. {fraudcrawler-0.5.7 → fraudcrawler-0.5.8}/fraudcrawler/base/retry.py +0 -0
  14. {fraudcrawler-0.5.7 → fraudcrawler-0.5.8}/fraudcrawler/launch_demo_pipeline.py +0 -0
  15. {fraudcrawler-0.5.7 → fraudcrawler-0.5.8}/fraudcrawler/processing/__init__.py +0 -0
  16. {fraudcrawler-0.5.7 → fraudcrawler-0.5.8}/fraudcrawler/processing/processor.py +0 -0
  17. {fraudcrawler-0.5.7 → fraudcrawler-0.5.8}/fraudcrawler/scraping/__init__.py +0 -0
  18. {fraudcrawler-0.5.7 → fraudcrawler-0.5.8}/fraudcrawler/scraping/enrich.py +0 -0
  19. {fraudcrawler-0.5.7 → fraudcrawler-0.5.8}/fraudcrawler/scraping/url.py +0 -0
  20. {fraudcrawler-0.5.7 → fraudcrawler-0.5.8}/fraudcrawler/scraping/zyte.py +0 -0
  21. {fraudcrawler-0.5.7 → fraudcrawler-0.5.8}/fraudcrawler/settings.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: fraudcrawler
3
- Version: 0.5.7
3
+ Version: 0.5.8
4
4
  Summary: Intelligent Market Monitoring
5
5
  Home-page: https://github.com/open-veanu/fraudcrawler
6
6
  License: MIT
@@ -247,31 +247,32 @@ class DomainUtils:
247
247
 
248
248
  async def _unblock_url(self, url: str, zyte_api: "ZyteAPI") -> bytes | None:
249
249
  """Attempts to unblock a URL using Zyte proxy mode when direct access fails.
250
-
250
+
251
251
  This method is specifically designed to handle 403 Forbidden errors for domains
252
252
  that may be blocking requests from certain IP ranges (like cloud providers).
253
-
253
+
254
254
  Args:
255
255
  url: The URL to fetch using Zyte proxy mode.
256
256
  zyte_api: An instance of ZyteAPI to use for the request.
257
-
257
+
258
258
  Returns:
259
259
  The HTML content as bytes if successful, None if failed.
260
260
  """
261
261
  try:
262
262
  logger.info(f"Attempting to unblock URL using Zyte proxy: {url}")
263
263
  details = await zyte_api.details(url)
264
-
264
+
265
265
  if details and "httpResponseBody" in details:
266
266
  # Decode the base64 content
267
267
  import base64
268
+
268
269
  html_content = base64.b64decode(details["httpResponseBody"])
269
270
  logger.info(f"Successfully unblocked URL using Zyte proxy: {url}")
270
271
  return html_content
271
272
  else:
272
273
  logger.warning(f"Zyte proxy request failed for URL: {url}")
273
274
  return None
274
-
275
+
275
276
  except Exception as e:
276
277
  logger.error(f"Error unblocking URL with Zyte proxy: {url}, error: {e}")
277
278
  return None
@@ -114,8 +114,13 @@ class Orchestrator(ABC):
114
114
  self._owns_http_client = True
115
115
 
116
116
  # Setup the clients
117
+ self._zyteapi = ZyteAPI(
118
+ http_client=self._http_client, api_key=self._zyteapi_key
119
+ )
117
120
  self._search = Search(
118
- http_client=self._http_client, serpapi_key=self._serpapi_key, zyte_api=self._zyteapi
121
+ http_client=self._http_client,
122
+ serpapi_key=self._serpapi_key,
123
+ zyte_api=self._zyteapi,
119
124
  )
120
125
  self._enricher = Enricher(
121
126
  http_client=self._http_client,
@@ -123,9 +128,6 @@ class Orchestrator(ABC):
123
128
  pwd=self._dataforseo_pwd,
124
129
  )
125
130
  self._url_collector = URLCollector()
126
- self._zyteapi = ZyteAPI(
127
- http_client=self._http_client, api_key=self._zyteapi_key
128
- )
129
131
  self._processor = Processor(
130
132
  http_client=self._http_client,
131
133
  api_key=self._openaiapi_key,
@@ -450,7 +450,7 @@ class Toppreise(SearchEngine):
450
450
  retry.before_sleep = lambda retry_state: self._log_before_sleep(
451
451
  search_string=search_string, retry_state=retry_state
452
452
  )
453
-
453
+
454
454
  content = None
455
455
  try:
456
456
  async for attempt in retry:
@@ -463,7 +463,9 @@ class Toppreise(SearchEngine):
463
463
  content = response.content
464
464
  except httpx.HTTPStatusError as e:
465
465
  if e.response.status_code == 403 and self._zyte_api:
466
- logger.warning(f"Received 403 Forbidden for {url}, attempting to unblock with Zyte proxy")
466
+ logger.warning(
467
+ f"Received 403 Forbidden for {url}, attempting to unblock with Zyte proxy"
468
+ )
467
469
  content = await self._unblock_url(url, self._zyte_api)
468
470
  if content is None:
469
471
  raise e # Re-raise if zyte fallback also failed
@@ -471,7 +473,7 @@ class Toppreise(SearchEngine):
471
473
  raise e
472
474
 
473
475
  if content is None:
474
- raise httpx.HTTPStatusError("Failed to fetch content", request=None, response=None)
476
+ raise httpx.HTTPError("Failed to fetch content")
475
477
 
476
478
  # Get external product urls from the content
477
479
  urls = self._get_external_product_urls(content=content)
@@ -4,7 +4,7 @@ build-backend = "poetry.core.masonry.api"
4
4
 
5
5
  [tool.poetry]
6
6
  name = "fraudcrawler"
7
- version = "0.5.7"
7
+ version = "0.5.8"
8
8
  description = "Intelligent Market Monitoring"
9
9
  authors = [
10
10
  "Domingo Bertus <hello@veanu.ch>",
File without changes
File without changes