fraudcrawler 0.3.8.tar.gz → 0.3.10.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of fraudcrawler might be problematic. Click here for more details.

Files changed (19)
  1. {fraudcrawler-0.3.8 → fraudcrawler-0.3.10}/PKG-INFO +1 -1
  2. {fraudcrawler-0.3.8 → fraudcrawler-0.3.10}/fraudcrawler/base/base.py +9 -2
  3. {fraudcrawler-0.3.8 → fraudcrawler-0.3.10}/fraudcrawler/scraping/serp.py +28 -16
  4. {fraudcrawler-0.3.8 → fraudcrawler-0.3.10}/fraudcrawler/settings.py +3 -0
  5. {fraudcrawler-0.3.8 → fraudcrawler-0.3.10}/pyproject.toml +1 -1
  6. {fraudcrawler-0.3.8 → fraudcrawler-0.3.10}/LICENSE +0 -0
  7. {fraudcrawler-0.3.8 → fraudcrawler-0.3.10}/README.md +0 -0
  8. {fraudcrawler-0.3.8 → fraudcrawler-0.3.10}/fraudcrawler/__init__.py +0 -0
  9. {fraudcrawler-0.3.8 → fraudcrawler-0.3.10}/fraudcrawler/base/__init__.py +0 -0
  10. {fraudcrawler-0.3.8 → fraudcrawler-0.3.10}/fraudcrawler/base/client.py +0 -0
  11. {fraudcrawler-0.3.8 → fraudcrawler-0.3.10}/fraudcrawler/base/google-languages.json +0 -0
  12. {fraudcrawler-0.3.8 → fraudcrawler-0.3.10}/fraudcrawler/base/google-locations.json +0 -0
  13. {fraudcrawler-0.3.8 → fraudcrawler-0.3.10}/fraudcrawler/base/orchestrator.py +0 -0
  14. {fraudcrawler-0.3.8 → fraudcrawler-0.3.10}/fraudcrawler/launch_demo_pipeline.py +0 -0
  15. {fraudcrawler-0.3.8 → fraudcrawler-0.3.10}/fraudcrawler/processing/__init__.py +0 -0
  16. {fraudcrawler-0.3.8 → fraudcrawler-0.3.10}/fraudcrawler/processing/processor.py +0 -0
  17. {fraudcrawler-0.3.8 → fraudcrawler-0.3.10}/fraudcrawler/scraping/__init__.py +0 -0
  18. {fraudcrawler-0.3.8 → fraudcrawler-0.3.10}/fraudcrawler/scraping/enrich.py +0 -0
  19. {fraudcrawler-0.3.8 → fraudcrawler-0.3.10}/fraudcrawler/scraping/zyte.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: fraudcrawler
3
- Version: 0.3.8
3
+ Version: 0.3.10
4
4
  Summary: Intelligent Market Monitoring
5
5
  Home-page: https://github.com/open-veanu/fraudcrawler
6
6
  License: MIT
@@ -6,6 +6,7 @@ from pydantic import (
6
6
  model_validator,
7
7
  )
8
8
  from pydantic_settings import BaseSettings
9
+ import re
9
10
  from typing import List
10
11
 
11
12
  import aiohttp
@@ -48,11 +49,17 @@ class Host(BaseModel):
48
49
  name: str
49
50
  domains: str | List[str]
50
51
 
52
+ @staticmethod
53
+ def _normalize_domain(domain: str) -> str:
54
+ """Make it lowercase and strip 'www.' and 'https?://' prefixes from the domain."""
55
+ domain = domain.strip().lower()
56
+ return re.sub(r"^(https?://)?(www\.)?", "", domain)
57
+
51
58
  @field_validator("domains", mode="before")
52
- def split_domains_if_str(cls, val):
59
+ def normalize_domains(cls, val):
53
60
  if isinstance(val, str):
54
61
  val = val.split(",")
55
- return [dom.strip().lower() for dom in val]
62
+ return [cls._normalize_domain(dom.strip()) for dom in val]
56
63
 
57
64
 
58
65
  class Location(BaseModel):
@@ -4,7 +4,7 @@ from pydantic import BaseModel
4
4
  from typing import List
5
5
  from urllib.parse import urlparse
6
6
 
7
- from fraudcrawler.settings import MAX_RETRIES, RETRY_DELAY
7
+ from fraudcrawler.settings import MAX_RETRIES, RETRY_DELAY, SERP_DEFAULT_COUNTRY_CODES
8
8
  from fraudcrawler.base.base import Host, Language, Location, AsyncClient
9
9
  import re
10
10
 
@@ -148,17 +148,32 @@ class SerpApi(AsyncClient):
148
148
  return urls
149
149
 
150
150
  @staticmethod
151
- def _has_included_country_code(url: str, country_code: str) -> bool:
152
- """Determines whether to keep the url based on the country_code.
151
+ def _relevant_country_code(url: str, country_code: str) -> bool:
152
+ """Determines whether the url shows relevant country codes.
153
153
 
154
154
  Args:
155
155
  url: The URL to investigate.
156
156
  country_code: The country code used to filter the products.
157
157
  """
158
- return f".{country_code}" in url.lower() or ".com" in url.lower()
158
+ url = url.lower()
159
+ country_code_relevance = f".{country_code}" in url
160
+ default_relevance = any(cc in url for cc in SERP_DEFAULT_COUNTRY_CODES)
161
+ return country_code_relevance or default_relevance
159
162
 
160
163
  @staticmethod
161
- def _domain_is_present(domain: str, hosts: List[Host]) -> bool:
164
+ def _domain_in_host(domain: str, host: Host) -> bool:
165
+ """Checks if the domain is present in the host.
166
+
167
+ Args:
168
+ domain: The domain to check.
169
+ host: The host to check against.
170
+ """
171
+ return any(
172
+ domain == hst_dom or domain.endswith(f".{hst_dom}")
173
+ for hst_dom in host.domains
174
+ )
175
+
176
+ def _domain_in_hosts(self, domain: str, hosts: List[Host]) -> bool:
162
177
  """Checks if the domain is present in the list of hosts.
163
178
 
164
179
  Note:
@@ -171,10 +186,7 @@ class SerpApi(AsyncClient):
171
186
  domain: The domain to check.
172
187
  hosts: The list of hosts to check against.
173
188
  """
174
- for hst_dom in [dom for hst in hosts for dom in hst.domains]:
175
- if domain == hst_dom or domain.endswith(f".{hst_dom}"):
176
- return True
177
- return False
189
+ return any(self._domain_in_host(domain=domain, host=hst) for hst in hosts)
178
190
 
179
191
  def _is_excluded_url(self, domain: str, excluded_urls: List[Host]) -> bool:
180
192
  """Checks if the domain is in the excluded URLs.
@@ -183,7 +195,7 @@ class SerpApi(AsyncClient):
183
195
  domain: The domain to check.
184
196
  excluded_urls: The list of excluded URLs.
185
197
  """
186
- return self._domain_is_present(domain=domain, hosts=excluded_urls)
198
+ return self._domain_in_hosts(domain=domain, hosts=excluded_urls)
187
199
 
188
200
  def _apply_filters(
189
201
  self,
@@ -203,13 +215,11 @@ class SerpApi(AsyncClient):
203
215
  domain = result.domain
204
216
  # Check if the URL is in the marketplaces (if yes, keep the result un-touched)
205
217
  if marketplaces:
206
- if self._domain_is_present(domain=domain, hosts=marketplaces):
218
+ if self._domain_in_hosts(domain=domain, hosts=marketplaces):
207
219
  return result
208
220
 
209
- # Check if the URL has the included country code
210
- if not self._has_included_country_code(
211
- url=result.url, country_code=location.code
212
- ):
221
+ # Check if the URL has a relevant country_code
222
+ if not self._relevant_country_code(url=result.url, country_code=location.code):
213
223
  result.filtered = True
214
224
  result.filtered_at_stage = "SerpAPI (country code filtering)"
215
225
  return result
@@ -244,7 +254,9 @@ class SerpApi(AsyncClient):
244
254
  if marketplaces:
245
255
  try:
246
256
  marketplace_name = next(
247
- mp.name for mp in marketplaces if domain in [d for d in mp.domains]
257
+ mp.name
258
+ for mp in marketplaces
259
+ if self._domain_in_host(domain=domain, host=mp)
248
260
  )
249
261
  except StopIteration:
250
262
  logger.warning(f'Failed to find marketplace for domain="{domain}".')
@@ -8,6 +8,9 @@ ROOT_DIR = Path(__file__).parents[1]
8
8
  # Serp settings
9
9
  GOOGLE_LOCATIONS_FILENAME = ROOT_DIR / "fraudcrawler" / "base" / "google-locations.json"
10
10
  GOOGLE_LANGUAGES_FILENAME = ROOT_DIR / "fraudcrawler" / "base" / "google-languages.json"
11
+ SERP_DEFAULT_COUNTRY_CODES = [
12
+ ".com",
13
+ ]
11
14
 
12
15
  # Enrichment settings
13
16
  ENRICHMENT_DEFAULT_LIMIT = 10
@@ -4,7 +4,7 @@ build-backend = "poetry.core.masonry.api"
4
4
 
5
5
  [tool.poetry]
6
6
  name = "fraudcrawler"
7
- version = "0.3.8"
7
+ version = "0.3.10"
8
8
  description = "Intelligent Market Monitoring"
9
9
  authors = [
10
10
  "Domingo Bertus <hello@veanu.ch>",
File without changes
File without changes