fraudcrawler 0.3.7__tar.gz → 0.3.9__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of fraudcrawler might be problematic.

Files changed (19)
  1. {fraudcrawler-0.3.7 → fraudcrawler-0.3.9}/PKG-INFO +1 -1
  2. {fraudcrawler-0.3.7 → fraudcrawler-0.3.9}/fraudcrawler/base/base.py +2 -2
  3. {fraudcrawler-0.3.7 → fraudcrawler-0.3.9}/fraudcrawler/scraping/serp.py +102 -46
  4. {fraudcrawler-0.3.7 → fraudcrawler-0.3.9}/fraudcrawler/settings.py +3 -0
  5. {fraudcrawler-0.3.7 → fraudcrawler-0.3.9}/pyproject.toml +1 -1
  6. {fraudcrawler-0.3.7 → fraudcrawler-0.3.9}/LICENSE +0 -0
  7. {fraudcrawler-0.3.7 → fraudcrawler-0.3.9}/README.md +0 -0
  8. {fraudcrawler-0.3.7 → fraudcrawler-0.3.9}/fraudcrawler/__init__.py +0 -0
  9. {fraudcrawler-0.3.7 → fraudcrawler-0.3.9}/fraudcrawler/base/__init__.py +0 -0
  10. {fraudcrawler-0.3.7 → fraudcrawler-0.3.9}/fraudcrawler/base/client.py +0 -0
  11. {fraudcrawler-0.3.7 → fraudcrawler-0.3.9}/fraudcrawler/base/google-languages.json +0 -0
  12. {fraudcrawler-0.3.7 → fraudcrawler-0.3.9}/fraudcrawler/base/google-locations.json +0 -0
  13. {fraudcrawler-0.3.7 → fraudcrawler-0.3.9}/fraudcrawler/base/orchestrator.py +0 -0
  14. {fraudcrawler-0.3.7 → fraudcrawler-0.3.9}/fraudcrawler/launch_demo_pipeline.py +0 -0
  15. {fraudcrawler-0.3.7 → fraudcrawler-0.3.9}/fraudcrawler/processing/__init__.py +0 -0
  16. {fraudcrawler-0.3.7 → fraudcrawler-0.3.9}/fraudcrawler/processing/processor.py +0 -0
  17. {fraudcrawler-0.3.7 → fraudcrawler-0.3.9}/fraudcrawler/scraping/__init__.py +0 -0
  18. {fraudcrawler-0.3.7 → fraudcrawler-0.3.9}/fraudcrawler/scraping/enrich.py +0 -0
  19. {fraudcrawler-0.3.7 → fraudcrawler-0.3.9}/fraudcrawler/scraping/zyte.py +0 -0
{fraudcrawler-0.3.7 → fraudcrawler-0.3.9}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.3
 Name: fraudcrawler
-Version: 0.3.7
+Version: 0.3.9
 Summary: Intelligent Market Monitoring
 Home-page: https://github.com/open-veanu/fraudcrawler
 License: MIT
{fraudcrawler-0.3.7 → fraudcrawler-0.3.9}/fraudcrawler/base/base.py

@@ -51,8 +51,8 @@ class Host(BaseModel):
     @field_validator("domains", mode="before")
     def split_domains_if_str(cls, val):
         if isinstance(val, str):
-            return [dom.strip() for dom in val.split(",")]
-        return val
+            val = val.split(",")
+        return [dom.strip().lower() for dom in val]


 class Location(BaseModel):
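
The validator change is behavioral, not just stylistic: in 0.3.7 only comma-separated strings were split and stripped, while list inputs (and letter case) passed through untouched; in 0.3.9 every input shape comes out stripped and lowercased, which is what makes the case-free domain comparisons later in serp.py safe. A minimal sketch of the new behavior, assuming a pydantic v2 Host model of which the diff only shows the domains field:

from typing import List
from pydantic import BaseModel, field_validator

class Host(BaseModel):
    domains: List[str]

    @field_validator("domains", mode="before")
    def split_domains_if_str(cls, val):
        # 0.3.9: split strings first, then strip and lowercase everything
        if isinstance(val, str):
            val = val.split(",")
        return [dom.strip().lower() for dom in val]

Host(domains="Example.COM, Shop.Example.CH").domains  # ['example.com', 'shop.example.ch']
Host(domains=["Example.COM"]).domains                 # ['example.com'] (0.3.7 returned this unchanged)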
{fraudcrawler-0.3.7 → fraudcrawler-0.3.9}/fraudcrawler/scraping/serp.py

@@ -4,7 +4,7 @@ from pydantic import BaseModel
 from typing import List
 from urllib.parse import urlparse

-from fraudcrawler.settings import MAX_RETRIES, RETRY_DELAY
+from fraudcrawler.settings import MAX_RETRIES, RETRY_DELAY, SERP_DEFAULT_COUNTRY_CODES
 from fraudcrawler.base.base import Host, Language, Location, AsyncClient
 import re

@@ -66,12 +66,12 @@ class SerpApi(AsyncClient):
             logger.warning(
                 f'Failed to extract domain from url="{url}"; full url is returned'
             )
-            return url
+            return url.lower()

         # Remove www. prefix
         if hostname and hostname.startswith("www."):
             hostname = hostname[4:]
-        return hostname
+        return hostname.lower()

     async def _search(
         self,
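
The two .lower() additions make _get_domain return lowercase in both the success and the fallback path. The full method body is not shown in the hunk; below is a hedged standalone reconstruction that matches the visible fragments (the urlparse import at the top of serp.py suggests the hostname comes from there, but that part is an assumption):

from urllib.parse import urlparse

def get_domain(url: str) -> str:
    # Sketch of SerpApi._get_domain; only the `.lower()` returns and the
    # www-prefix stripping are confirmed by the diff.
    hostname = urlparse(url).hostname
    if not hostname:
        # real code logs: 'Failed to extract domain from url="...";
        # full url is returned'
        return url.lower()
    if hostname.startswith("www."):
        hostname = hostname[4:]
    return hostname.lower()

get_domain("https://WWW.Example.CH/product/1")  # 'example.ch'
get_domain("not-a-url")                         # 'not-a-url' (fallback path)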
@@ -148,20 +148,96 @@ class SerpApi(AsyncClient):
         return urls

     @staticmethod
-    def _keep_url(url: str, country_code: str) -> bool:
-        """Determines whether to keep the url based on the country_code.
+    def _relevant_country_code(url: str, country_code: str) -> bool:
+        """Determines whether the url shows relevant country codes.

         Args:
             url: The URL to investigate.
             country_code: The country code used to filter the products.
         """
-        return f".{country_code}" in url.lower() or ".com" in url.lower()
+        url = url.lower()
+        country_code_relevance = f".{country_code}" in url
+        default_relevance = any(cc in url for cc in SERP_DEFAULT_COUNTRY_CODES)
+        return country_code_relevance or default_relevance
+
+    @staticmethod
+    def _domain_in_host(domain: str, host: Host) -> bool:
+        """Checks if the domain is present in the host.
+
+        Args:
+            domain: The domain to check.
+            host: The host to check against.
+        """
+        return any(
+            domain == hst_dom or domain.endswith(f".{hst_dom}")
+            for hst_dom in host.domains
+        )
+
+    def _domain_in_hosts(self, domain: str, hosts: List[Host]) -> bool:
+        """Checks if the domain is present in the list of hosts.
+
+        Note:
+            By checking `if domain == hst_dom or domain.endswith(f".{hst_dom}")`
+            it also checks for subdomains. For example, if the domain is
+            `link.springer.com` and the host domain is `springer.com`,
+            it will be detected as being present in the hosts.
+
+        Args:
+            domain: The domain to check.
+            hosts: The list of hosts to check against.
+        """
+        return any(self._domain_in_host(domain=domain, host=hst) for hst in hosts)
+
+    def _is_excluded_url(self, domain: str, excluded_urls: List[Host]) -> bool:
+        """Checks if the domain is in the excluded URLs.
+
+        Args:
+            domain: The domain to check.
+            excluded_urls: The list of excluded URLs.
+        """
+        return self._domain_in_hosts(domain=domain, hosts=excluded_urls)
+
+    def _apply_filters(
+        self,
+        result: SerpResult,
+        location: Location,
+        marketplaces: List[Host] | None = None,
+        excluded_urls: List[Host] | None = None,
+    ) -> SerpResult:
+        """Checks for filters and updates the SerpResult accordingly.
+
+        Args:
+            result: The SerpResult object to check.
+            location: The location to use for the query.
+            marketplaces: The list of marketplaces to compare the URL against.
+            excluded_urls: The list of excluded URLs.
+        """
+        domain = result.domain
+        # Check if the URL is in the marketplaces (if yes, keep the result un-touched)
+        if marketplaces:
+            if self._domain_in_hosts(domain=domain, hosts=marketplaces):
+                return result
+
+        # Check if the URL has a relevant country_code
+        if not self._relevant_country_code(url=result.url, country_code=location.code):
+            result.filtered = True
+            result.filtered_at_stage = "SerpAPI (country code filtering)"
+            return result
+
+        # Check if the URL is in the excluded URLs
+        if excluded_urls and self._is_excluded_url(result.domain, excluded_urls):
+            result.filtered = True
+            result.filtered_at_stage = "SerpAPI (excluded URLs filtering)"
+            return result
+
+        return result

     def _create_serp_result(
         self,
         url: str,
         location: Location,
-        marketplaces: List[Host] | None,
+        marketplaces: List[Host] | None = None,
+        excluded_urls: List[Host] | None = None,
     ) -> SerpResult:
         """From a given url it creates the class:`SerpResult` instance.

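The subdomain semantics of the new _domain_in_host helper are worth pinning down, since they now drive both marketplace matching and URL exclusion. The predicate is pure string logic, so it can be exercised in isolation (a sketch; the plain list stands in for Host.domains):

def domain_in_host(domain: str, host_domains: list) -> bool:
    # Same predicate as SerpApi._domain_in_host: exact match, or the
    # candidate ends with ".<host domain>" (i.e. is a subdomain of it).
    return any(
        domain == hst_dom or domain.endswith(f".{hst_dom}")
        for hst_dom in host_domains
    )

domain_in_host("springer.com", ["springer.com"])       # True  (exact match)
domain_in_host("link.springer.com", ["springer.com"])  # True  (subdomain)
domain_in_host("notspringer.com", ["springer.com"])    # False (dot boundary prevents it)

Because _get_domain and the Host validator both lowercase their output as of this release, the comparison is effectively case-insensitive without any .lower() calls here.
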
@@ -172,50 +248,34 @@ class SerpApi(AsyncClient):
             location: The location to use for the query.
             marketplaces: The list of marketplaces to compare the URL against.
         """
-        # Filter for county code
-        filtered = not self._keep_url(url=url, country_code=location.code)
-        filtered_at_stage = "country code filtering" if filtered else None
-
         # Get marketplace name
         domain = self._get_domain(url=url)
         marketplace_name = self._default_marketplace_name
-        if domain and marketplaces:
+        if marketplaces:
             try:
                 marketplace_name = next(
                     mp.name
                     for mp in marketplaces
-                    if domain.lower() in [d.lower() for d in mp.domains]
+                    if self._domain_in_host(domain=domain, host=mp)
                 )
             except StopIteration:
                 logger.warning(f'Failed to find marketplace for domain="{domain}".')
-        return SerpResult(
+
+        # Create the SerpResult object
+        result = SerpResult(
             url=url,
             domain=domain,
             marketplace_name=marketplace_name,
-            filtered=filtered,
-            filtered_at_stage=filtered_at_stage,
         )

-    @staticmethod
-    def _is_excluded(domain: str, excluded_urls: List[Host]) -> bool:
-        """Checks if the domain is in the excluded URLs.
-
-        Note:
-            By checking `if dom == excl or dom.endswith(f".{excl}")` we also
-            check for subdomains. For example, if the domain is
-            `link.springer.com` and the excluded URL is `springer.com`,
-            it will be excluded.
-
-        Args:
-            domain: The domain to check.
-            excluded_urls: The list of excluded URLs.
-        """
-        dom = domain.lower()
-        excl_doms = [dom.lower() for excl in excluded_urls for dom in excl.domains]
-        for excl in excl_doms:
-            if dom == excl or dom.endswith(f".{excl}"):
-                return True
-        return False
+        # Apply filters
+        result = self._apply_filters(
+            result=result,
+            location=location,
+            marketplaces=marketplaces,
+            excluded_urls=excluded_urls,
+        )
+        return result

     async def apply(
         self,
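
The net effect of this hunk: _create_serp_result no longer computes filtered/filtered_at_stage itself; it builds an unfiltered SerpResult and delegates to _apply_filters, so marketplace URLs short-circuit past the country-code and exclusion checks. A condensed sketch of the new flow; the five SerpResult fields are confirmed by the diff, while the BaseModel shape and the defaults are assumptions:

from pydantic import BaseModel

class SerpResult(BaseModel):
    url: str
    domain: str
    marketplace_name: str
    filtered: bool = False                # assumed default
    filtered_at_stage: str | None = None  # assumed default

# 0.3.9 flow inside _create_serp_result:
#   result = SerpResult(url=url, domain=domain, marketplace_name=marketplace_name)
#   result = self._apply_filters(result=result, location=location,
#                                marketplaces=marketplaces, excluded_urls=excluded_urls)
# Marketplace hits come back untouched; other URLs may come back flagged with
# filtered=True and a stage label such as "SerpAPI (excluded URLs filtering)".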
@@ -256,20 +316,16 @@ class SerpApi(AsyncClient):
         # Form the SerpResult objects
         results = [
             self._create_serp_result(
-                url=url, location=location, marketplaces=marketplaces
+                url=url,
+                location=location,
+                marketplaces=marketplaces,
+                excluded_urls=excluded_urls,
             )
             for url in urls
         ]

-        # Filter out the excluded URLs
-        if excluded_urls:
-            results = [
-                res
-                for res in results
-                if not self._is_excluded(res.domain, excluded_urls)
-            ]
-
+        num_non_filtered = len([res for res in results if not res.filtered])
         logger.info(
-            f'Produced {len(results)} results from SerpApi search with q="{search_string}".'
+            f'Produced {num_non_filtered} results from SerpApi search with q="{search_string}".'
         )
         return results
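
Note the contract change in apply: in 0.3.7 excluded URLs were dropped from the returned list, so callers never saw them; in 0.3.9 every URL yields a SerpResult and exclusions are only flagged, so downstream stages can inspect why something was filtered. The log line counts survivors accordingly. A small sketch using the SerpResult shape assumed above (values are illustrative, not from the release):

results = [
    SerpResult(url="https://shop.example.ch/p/1", domain="shop.example.ch",
               marketplace_name="default"),
    SerpResult(url="https://spam.example.com/p/2", domain="spam.example.com",
               marketplace_name="default", filtered=True,
               filtered_at_stage="SerpAPI (excluded URLs filtering)"),
]
num_non_filtered = len([res for res in results if not res.filtered])  # 1
# 0.3.7 would have logged 1 after physically removing the excluded entry;
# 0.3.9 logs the same count but still returns both entries.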
{fraudcrawler-0.3.7 → fraudcrawler-0.3.9}/fraudcrawler/settings.py

@@ -8,6 +8,9 @@ ROOT_DIR = Path(__file__).parents[1]
 # Serp settings
 GOOGLE_LOCATIONS_FILENAME = ROOT_DIR / "fraudcrawler" / "base" / "google-locations.json"
 GOOGLE_LANGUAGES_FILENAME = ROOT_DIR / "fraudcrawler" / "base" / "google-languages.json"
+SERP_DEFAULT_COUNTRY_CODES = [
+    ".com",
+]

 # Enrichment settings
 ENRICHMENT_DEFAULT_LIMIT = 10
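
SERP_DEFAULT_COUNTRY_CODES replaces the ".com" literal that 0.3.7 hard-coded inside _keep_url, so the set of always-relevant suffixes is now configuration rather than code. Widening it is a one-line edit (the ".org" entry below is hypothetical, not part of the release):

SERP_DEFAULT_COUNTRY_CODES = [
    ".com",
    ".org",  # hypothetical: also treat .org URLs as always relevant
]

One caveat: _relevant_country_code does a plain substring check (cc in url), so ".com" also matches a URL like "https://example.com.evil.ch/..." anywhere in the string, not just at the TLD.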
{fraudcrawler-0.3.7 → fraudcrawler-0.3.9}/pyproject.toml

@@ -4,7 +4,7 @@ build-backend = "poetry.core.masonry.api"

 [tool.poetry]
 name = "fraudcrawler"
-version = "0.3.7"
+version = "0.3.9"
 description = "Intelligent Market Monitoring"
 authors = [
     "Domingo Bertus <hello@veanu.ch>",