fraudcrawler-0.5.7-py3-none-any.whl → fraudcrawler-0.5.9-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of fraudcrawler might be problematic. Click here for more details.

fraudcrawler/base/base.py CHANGED
@@ -1,5 +1,6 @@
1
1
  import json
2
2
  import logging
3
+ import base64
3
4
  from pydantic import (
4
5
  BaseModel,
5
6
  Field,
@@ -9,7 +10,7 @@ from pydantic import (
9
10
  from pydantic_settings import BaseSettings
10
11
  from urllib.parse import urlparse
11
12
  import re
12
- from typing import Any, Dict, List, TYPE_CHECKING
13
+ from typing import Any, Dict, List
13
14
 
14
15
  import httpx
15
16
 
@@ -23,9 +24,6 @@ from fraudcrawler.settings import (
23
24
  DEFAULT_HTTPX_REDIRECTS,
24
25
  )
25
26
 
26
- if TYPE_CHECKING:
27
- from fraudcrawler.scraping.zyte import ZyteAPI
28
-
29
27
  logger = logging.getLogger(__name__)
30
28
 
31
29
  # Load google locations and languages
@@ -245,33 +243,31 @@ class DomainUtils:
245
243
  hostname = hostname[4:]
246
244
  return hostname.lower()
247
245
 
248
- async def _unblock_url(self, url: str, zyte_api: "ZyteAPI") -> bytes | None:
246
+ async def _unblock_url(self, url: str, zyte_api: Any) -> bytes | None:
249
247
  """Attempts to unblock a URL using Zyte proxy mode when direct access fails.
250
-
248
+
251
249
  This method is specifically designed to handle 403 Forbidden errors for domains
252
250
  that may be blocking requests from certain IP ranges (like cloud providers).
253
-
251
+
254
252
  Args:
255
253
  url: The URL to fetch using Zyte proxy mode.
256
254
  zyte_api: An instance of ZyteAPI to use for the request.
257
-
255
+
258
256
  Returns:
259
257
  The HTML content as bytes if successful, None if failed.
260
258
  """
261
259
  try:
262
260
  logger.info(f"Attempting to unblock URL using Zyte proxy: {url}")
263
261
  details = await zyte_api.details(url)
264
-
262
+
265
263
  if details and "httpResponseBody" in details:
266
- # Decode the base64 content
267
- import base64
268
264
  html_content = base64.b64decode(details["httpResponseBody"])
269
265
  logger.info(f"Successfully unblocked URL using Zyte proxy: {url}")
270
266
  return html_content
271
267
  else:
272
268
  logger.warning(f"Zyte proxy request failed for URL: {url}")
273
269
  return None
274
-
270
+
275
271
  except Exception as e:
276
272
  logger.error(f"Error unblocking URL with Zyte proxy: {url}, error: {e}")
277
273
  return None
fraudcrawler/base/orchestrator.py CHANGED
@@ -114,8 +114,13 @@ class Orchestrator(ABC):
114
114
  self._owns_http_client = True
115
115
 
116
116
  # Setup the clients
117
+ self._zyteapi = ZyteAPI(
118
+ http_client=self._http_client, api_key=self._zyteapi_key
119
+ )
117
120
  self._search = Search(
118
- http_client=self._http_client, serpapi_key=self._serpapi_key, zyte_api=self._zyteapi
121
+ http_client=self._http_client,
122
+ serpapi_key=self._serpapi_key,
123
+ zyte_api=self._zyteapi,
119
124
  )
120
125
  self._enricher = Enricher(
121
126
  http_client=self._http_client,
@@ -123,9 +128,6 @@ class Orchestrator(ABC):
123
128
  pwd=self._dataforseo_pwd,
124
129
  )
125
130
  self._url_collector = URLCollector()
126
- self._zyteapi = ZyteAPI(
127
- http_client=self._http_client, api_key=self._zyteapi_key
128
- )
129
131
  self._processor = Processor(
130
132
  http_client=self._http_client,
131
133
  api_key=self._openaiapi_key,
fraudcrawler/launch_demo_pipeline.py CHANGED
@@ -97,4 +97,4 @@ def search(search_term: str):
97
97
 
98
98
 
99
99
  if __name__ == "__main__":
100
- search(search_term='Liebherr "TP1410"')
100
+ search(search_term="electric cigarettes")
fraudcrawler/scraping/search.py CHANGED
@@ -131,6 +131,17 @@ class SerpAPI(SearchEngine):
131
131
  search_string += " site:" + " OR site:".join(s for s in sites)
132
132
  return search_string
133
133
 
134
+ @staticmethod
135
+ def _get_google_domain(location: Location) -> str:
136
+ """Gets the Google domain for the given location if they do not use the default pattern google.tld"""
137
+ if location.name == "Brazil":
138
+ return "google.com.br"
139
+ elif location.name == "United Kingdom":
140
+ return "google.co.uk"
141
+ elif location.name == "Argentina":
142
+ return "google.com.ar"
143
+ return f"google.{location.code}"
144
+
134
145
  async def _search(
135
146
  self,
136
147
  search_string: str,
@@ -169,16 +180,19 @@ class SerpAPI(SearchEngine):
169
180
  f"num_results={num_results}."
170
181
  )
171
182
 
172
- # Setup the parameters
183
+ # Get Google domain and country code
184
+ google_domain = self._get_google_domain(location)
185
+ country_code = location.code
186
+
173
187
  params: Dict[str, str | int] = {
174
188
  "engine": engine,
175
189
  "q": search_string,
176
- "google_domain": f"google.{location.code}",
190
+ "google_domain": google_domain,
177
191
  "location_requested": location.name,
178
192
  "location_used": location.name,
179
- "tbs": f"ctr:{location.code.upper()}",
180
- "cr": f"country{location.code.upper()}",
181
- "gl": location.code,
193
+ "tbs": f"ctr:{country_code.upper()}",
194
+ "cr": f"country{country_code.upper()}",
195
+ "gl": country_code,
182
196
  "hl": language.code,
183
197
  "num": num_results,
184
198
  "api_key": self._api_key,
@@ -450,7 +464,7 @@ class Toppreise(SearchEngine):
450
464
  retry.before_sleep = lambda retry_state: self._log_before_sleep(
451
465
  search_string=search_string, retry_state=retry_state
452
466
  )
453
-
467
+
454
468
  content = None
455
469
  try:
456
470
  async for attempt in retry:
@@ -463,7 +477,9 @@ class Toppreise(SearchEngine):
463
477
  content = response.content
464
478
  except httpx.HTTPStatusError as e:
465
479
  if e.response.status_code == 403 and self._zyte_api:
466
- logger.warning(f"Received 403 Forbidden for {url}, attempting to unblock with Zyte proxy")
480
+ logger.warning(
481
+ f"Received 403 Forbidden for {url}, attempting to unblock with Zyte proxy"
482
+ )
467
483
  content = await self._unblock_url(url, self._zyte_api)
468
484
  if content is None:
469
485
  raise e # Re-raise if zyte fallback also failed
@@ -471,7 +487,7 @@ class Toppreise(SearchEngine):
471
487
  raise e
472
488
 
473
489
  if content is None:
474
- raise httpx.HTTPStatusError("Failed to fetch content", request=None, response=None)
490
+ raise httpx.HTTPError("Failed to fetch content")
475
491
 
476
492
  # Get external product urls from the content
477
493
  urls = self._get_external_product_urls(content=content)
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: fraudcrawler
3
- Version: 0.5.7
3
+ Version: 0.5.9
4
4
  Summary: Intelligent Market Monitoring
5
5
  Home-page: https://github.com/open-veanu/fraudcrawler
6
6
  License: MIT
@@ -1,22 +1,22 @@
1
1
  fraudcrawler/__init__.py,sha256=Kr19jWhtbC1shVoB9fHvBSeoG1IyQB9re1kCZ4YIAi0,842
2
2
  fraudcrawler/base/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
3
- fraudcrawler/base/base.py,sha256=94HTs8RpdpEics9d6o_uDniTRG1CCSO35LDsjY4hp5E,8750
3
+ fraudcrawler/base/base.py,sha256=S5VP1PD2tymZMmVcVmP4FI0VlCQxDCB2gp0vVD6eN8g,8543
4
4
  fraudcrawler/base/client.py,sha256=yhkNrhL2SuJXTknLf-8P81fv01FnFMahREZgem-Z-f0,5832
5
5
  fraudcrawler/base/google-languages.json,sha256=z0VtjMCsCcZq11OkCIb9jMDD1p9Ty4lhV7bq4ddYvec,10748
6
6
  fraudcrawler/base/google-locations.json,sha256=UtNu0iSStllvFRTQXMobWKmZR1hKmtgtHftLNgaJTT0,9204345
7
- fraudcrawler/base/orchestrator.py,sha256=lyrdX_pEq2y3VguXMRMmyEJviGEr5-SnqeIxoJmqWKc,27014
7
+ fraudcrawler/base/orchestrator.py,sha256=28X45XLPlJe2hvff8HTLo-V08LNeS0zMWBHe5W3hk4c,27039
8
8
  fraudcrawler/base/retry.py,sha256=9VyVrbYR_0YnfxFhUrvcM3aWCYR6oR4iZE4A3zzVZUs,1015
9
- fraudcrawler/launch_demo_pipeline.py,sha256=j5lu8lLl8QrkVU1MJH25uKtyYk_6lBSeoouCo30aRXg,4634
9
+ fraudcrawler/launch_demo_pipeline.py,sha256=hTzGFQDEwchDSwUx0HgG_TW5h9J7BXM7jn_iB8iI838,4636
10
10
  fraudcrawler/processing/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
11
11
  fraudcrawler/processing/processor.py,sha256=Qq8QcTlqfnzFi1t-1KkncXxaIszUO7pGK3LXTdHkDnM,7638
12
12
  fraudcrawler/scraping/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
13
13
  fraudcrawler/scraping/enrich.py,sha256=M4ErLF7q-5EKfEd-dIDS73mZc-aPFp5pJbgdRKCO3f8,13112
14
- fraudcrawler/scraping/search.py,sha256=JQ4nbylYdAk65yDDAatv-qGekRRRNy769VHQgzhqN8Y,24962
14
+ fraudcrawler/scraping/search.py,sha256=5fTRo7Tkkz9REU-u50oZu4kbynjIcIigp9HZ0cS_UCg,25510
15
15
  fraudcrawler/scraping/url.py,sha256=5Z3hPW73E-TLhM-Zha8OTcUOumc_rcx64R0fT9z2Hi8,1748
16
16
  fraudcrawler/scraping/zyte.py,sha256=GqvVWA1AWVoClAwd-hQ9iynsT0dOb7R0ntaLK5XVivM,8340
17
17
  fraudcrawler/settings.py,sha256=uwXMOQpuwyWkuMU0asYGtBlL_qJj8F-Xkg4dUaCmDxE,3670
18
- fraudcrawler-0.5.7.dist-info/LICENSE,sha256=B-3FuHfe3S0fWAlKlceskPcRhzXq81g-rJ-ddUYb4O8,1062
19
- fraudcrawler-0.5.7.dist-info/METADATA,sha256=tMdND63UPo5x2s49o_RMzQzqTSEBdrsv1TqQPL65DaM,6642
20
- fraudcrawler-0.5.7.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
21
- fraudcrawler-0.5.7.dist-info/entry_points.txt,sha256=1Befm7cM6945y2AA1z9V4gZV63mtSWcAs7ypvgux_Xg,79
22
- fraudcrawler-0.5.7.dist-info/RECORD,,
18
+ fraudcrawler-0.5.9.dist-info/LICENSE,sha256=B-3FuHfe3S0fWAlKlceskPcRhzXq81g-rJ-ddUYb4O8,1062
19
+ fraudcrawler-0.5.9.dist-info/METADATA,sha256=sMszTwcTVxuMGfRhJJZKLAPShnY4zDU6gShAa6k3tPg,6642
20
+ fraudcrawler-0.5.9.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
21
+ fraudcrawler-0.5.9.dist-info/entry_points.txt,sha256=1Befm7cM6945y2AA1z9V4gZV63mtSWcAs7ypvgux_Xg,79
22
+ fraudcrawler-0.5.9.dist-info/RECORD,,