fraudcrawler 0.3.6__tar.gz → 0.3.8__tar.gz

This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their respective public registries.

Potentially problematic release: this version of fraudcrawler might be problematic.

Files changed (19):
  1. {fraudcrawler-0.3.6 → fraudcrawler-0.3.8}/PKG-INFO +1 -1
  2. {fraudcrawler-0.3.6 → fraudcrawler-0.3.8}/fraudcrawler/base/base.py +2 -2
  3. {fraudcrawler-0.3.6 → fraudcrawler-0.3.8}/fraudcrawler/scraping/serp.py +90 -22
  4. {fraudcrawler-0.3.6 → fraudcrawler-0.3.8}/pyproject.toml +1 -1
  5. {fraudcrawler-0.3.6 → fraudcrawler-0.3.8}/LICENSE +0 -0
  6. {fraudcrawler-0.3.6 → fraudcrawler-0.3.8}/README.md +0 -0
  7. {fraudcrawler-0.3.6 → fraudcrawler-0.3.8}/fraudcrawler/__init__.py +0 -0
  8. {fraudcrawler-0.3.6 → fraudcrawler-0.3.8}/fraudcrawler/base/__init__.py +0 -0
  9. {fraudcrawler-0.3.6 → fraudcrawler-0.3.8}/fraudcrawler/base/client.py +0 -0
  10. {fraudcrawler-0.3.6 → fraudcrawler-0.3.8}/fraudcrawler/base/google-languages.json +0 -0
  11. {fraudcrawler-0.3.6 → fraudcrawler-0.3.8}/fraudcrawler/base/google-locations.json +0 -0
  12. {fraudcrawler-0.3.6 → fraudcrawler-0.3.8}/fraudcrawler/base/orchestrator.py +0 -0
  13. {fraudcrawler-0.3.6 → fraudcrawler-0.3.8}/fraudcrawler/launch_demo_pipeline.py +0 -0
  14. {fraudcrawler-0.3.6 → fraudcrawler-0.3.8}/fraudcrawler/processing/__init__.py +0 -0
  15. {fraudcrawler-0.3.6 → fraudcrawler-0.3.8}/fraudcrawler/processing/processor.py +0 -0
  16. {fraudcrawler-0.3.6 → fraudcrawler-0.3.8}/fraudcrawler/scraping/__init__.py +0 -0
  17. {fraudcrawler-0.3.6 → fraudcrawler-0.3.8}/fraudcrawler/scraping/enrich.py +0 -0
  18. {fraudcrawler-0.3.6 → fraudcrawler-0.3.8}/fraudcrawler/scraping/zyte.py +0 -0
  19. {fraudcrawler-0.3.6 → fraudcrawler-0.3.8}/fraudcrawler/settings.py +0 -0
```diff
--- fraudcrawler-0.3.6/PKG-INFO
+++ fraudcrawler-0.3.8/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.3
 Name: fraudcrawler
-Version: 0.3.6
+Version: 0.3.8
 Summary: Intelligent Market Monitoring
 Home-page: https://github.com/open-veanu/fraudcrawler
 License: MIT
```
```diff
--- fraudcrawler-0.3.6/fraudcrawler/base/base.py
+++ fraudcrawler-0.3.8/fraudcrawler/base/base.py
@@ -51,8 +51,8 @@ class Host(BaseModel):
     @field_validator("domains", mode="before")
     def split_domains_if_str(cls, val):
         if isinstance(val, str):
-            return [dom.strip() for dom in val.split(",")]
-        return val
+            val = val.split(",")
+        return [dom.strip().lower() for dom in val]
 
 
 class Location(BaseModel):
```
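Together with the serp.py hunks below, which lowercase the return value of `_get_domain`, this change makes domain matching case-insensitive end to end: `Host.domains` entries and extracted hostnames are both normalized to lowercase, which is why the marketplace lookup further down can drop its per-comparison `.lower()` calls. A minimal sketch of the new validator behavior, assuming pydantic v2 and reproducing only the `domains` field of `Host`:

```python
from typing import List

from pydantic import BaseModel, field_validator


class Host(BaseModel):
    # Sketch: only the field this hunk touches; the real Host model in
    # fraudcrawler/base/base.py also has at least a name (used as mp.name).
    domains: List[str]

    @field_validator("domains", mode="before")
    def split_domains_if_str(cls, val):
        # New in 0.3.8: splitting and normalization are separate steps, so
        # strip().lower() now also applies when a list is passed directly.
        if isinstance(val, str):
            val = val.split(",")
        return [dom.strip().lower() for dom in val]


print(Host(domains="Amazon.de, SHOP.Example.ch").domains)
# ['amazon.de', 'shop.example.ch']
print(Host(domains=["RICARDO.ch"]).domains)  # 0.3.6 returned this unchanged
# ['ricardo.ch']
```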
```diff
--- fraudcrawler-0.3.6/fraudcrawler/scraping/serp.py
+++ fraudcrawler-0.3.8/fraudcrawler/scraping/serp.py
@@ -66,12 +66,12 @@ class SerpApi(AsyncClient):
             logger.warning(
                 f'Failed to extract domain from url="{url}"; full url is returned'
             )
-            return url
+            return url.lower()
 
         # Remove www. prefix
         if hostname and hostname.startswith("www."):
             hostname = hostname[4:]
-        return hostname
+        return hostname.lower()
 
     async def _search(
         self,
@@ -148,7 +148,7 @@ class SerpApi(AsyncClient):
         return urls
 
     @staticmethod
-    def _keep_url(url: str, country_code: str) -> bool:
+    def _has_included_country_code(url: str, country_code: str) -> bool:
         """Determines whether to keep the url based on the country_code.
 
         Args:
```
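The predicate body is unchanged by the rename; it appears as context at the top of the next hunk: keep any URL that contains `.{country_code}` or `.com`. Note this is a plain substring test, not a TLD parse. A standalone sketch with hypothetical URLs:

```python
def has_included_country_code(url: str, country_code: str) -> bool:
    # Same test as SerpApi._has_included_country_code: a substring match,
    # so ".ch" anywhere in the URL counts, not only as the TLD.
    return f".{country_code}" in url.lower() or ".com" in url.lower()


print(has_included_country_code("https://shop.example.ch/item", "ch"))  # True
print(has_included_country_code("https://example.com/item", "ch"))      # True (.com always passes)
print(has_included_country_code("https://example.de/item", "ch"))       # False -> gets filtered
```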
```diff
@@ -157,11 +157,77 @@ class SerpApi(AsyncClient):
         """
         return f".{country_code}" in url.lower() or ".com" in url.lower()
 
+    @staticmethod
+    def _domain_is_present(domain: str, hosts: List[Host]) -> bool:
+        """Checks if the domain is present in the list of hosts.
+
+        Note:
+            By checking `if domain == hst_dom or domain.endswith(f".{hst_dom}")`
+            it also checks for subdomains. For example, if the domain is
+            `link.springer.com` and the host domain is `springer.com`,
+            it will be detected as being present in the hosts.
+
+        Args:
+            domain: The domain to check.
+            hosts: The list of hosts to check against.
+        """
+        for hst_dom in [dom for hst in hosts for dom in hst.domains]:
+            if domain == hst_dom or domain.endswith(f".{hst_dom}"):
+                return True
+        return False
+
+    def _is_excluded_url(self, domain: str, excluded_urls: List[Host]) -> bool:
+        """Checks if the domain is in the excluded URLs.
+
+        Args:
+            domain: The domain to check.
+            excluded_urls: The list of excluded URLs.
+        """
+        return self._domain_is_present(domain=domain, hosts=excluded_urls)
+
+    def _apply_filters(
+        self,
+        result: SerpResult,
+        location: Location,
+        marketplaces: List[Host] | None = None,
+        excluded_urls: List[Host] | None = None,
+    ) -> SerpResult:
+        """Checks for filters and updates the SerpResult accordingly.
+
+        Args:
+            result: The SerpResult object to check.
+            location: The location to use for the query.
+            marketplaces: The list of marketplaces to compare the URL against.
+            excluded_urls: The list of excluded URLs.
+        """
+        domain = result.domain
+        # Check if the URL is in the marketplaces (if yes, keep the result un-touched)
+        if marketplaces:
+            if self._domain_is_present(domain=domain, hosts=marketplaces):
+                return result
+
+        # Check if the URL has the included country code
+        if not self._has_included_country_code(
+            url=result.url, country_code=location.code
+        ):
+            result.filtered = True
+            result.filtered_at_stage = "SerpAPI (country code filtering)"
+            return result
+
+        # Check if the URL is in the excluded URLs
+        if excluded_urls and self._is_excluded_url(result.domain, excluded_urls):
+            result.filtered = True
+            result.filtered_at_stage = "SerpAPI (excluded URLs filtering)"
+            return result
+
+        return result
+
     def _create_serp_result(
         self,
         url: str,
         location: Location,
-        marketplaces: List[Host] | None,
+        marketplaces: List[Host] | None = None,
+        excluded_urls: List[Host] | None = None,
     ) -> SerpResult:
         """From a given url it creates the class:`SerpResult` instance.
 
```
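As the docstring of `_domain_is_present` notes, the dot-prefixed `endswith` test matches genuine subdomains while rejecting look-alike domains. A standalone sketch of the predicate, flattening the `List[Host]` to plain domain strings for brevity (`evilspringer.com` is a made-up counterexample):

```python
def domain_is_present(domain: str, host_domains: list[str]) -> bool:
    # Same logic as SerpApi._domain_is_present, minus the Host wrapper.
    for hst_dom in host_domains:
        if domain == hst_dom or domain.endswith(f".{hst_dom}"):
            return True
    return False


hosts = ["springer.com"]
print(domain_is_present("springer.com", hosts))       # True  (exact match)
print(domain_is_present("link.springer.com", hosts))  # True  (subdomain)
print(domain_is_present("evilspringer.com", hosts))   # False (no dot boundary)
```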
```diff
@@ -172,30 +238,33 @@ class SerpApi(AsyncClient):
             location: The location to use for the query.
             marketplaces: The list of marketplaces to compare the URL against.
         """
-        # Filter for county code
-        filtered = not self._keep_url(url=url, country_code=location.code)
-        filtered_at_stage = "country code filtering" if filtered else None
-
         # Get marketplace name
         domain = self._get_domain(url=url)
         marketplace_name = self._default_marketplace_name
-        if domain and marketplaces:
+        if marketplaces:
             try:
                 marketplace_name = next(
-                    mp.name
-                    for mp in marketplaces
-                    if domain.lower() in [d.lower() for d in mp.domains]
+                    mp.name for mp in marketplaces if domain in [d for d in mp.domains]
                 )
             except StopIteration:
                 logger.warning(f'Failed to find marketplace for domain="{domain}".')
-        return SerpResult(
+
+        # Create the SerpResult object
+        result = SerpResult(
             url=url,
             domain=domain,
             marketplace_name=marketplace_name,
-            filtered=filtered,
-            filtered_at_stage=filtered_at_stage,
         )
 
+        # Apply filters
+        result = self._apply_filters(
+            result=result,
+            location=location,
+            marketplaces=marketplaces,
+            excluded_urls=excluded_urls,
+        )
+        return result
+
     async def apply(
         self,
         search_term: str,
```
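The ordering inside `_apply_filters` is significant: a marketplace match returns early, so a known marketplace URL is kept even when its TLD would fail the country-code test or its domain also appears on the excluded list. A self-contained sketch of that precedence (the `Result` dataclass is a stand-in for `SerpResult`, with field names taken from this diff):

```python
from dataclasses import dataclass
from typing import List, Optional


@dataclass
class Result:
    url: str
    domain: str
    filtered: bool = False
    filtered_at_stage: Optional[str] = None


def apply_filters(result: Result, country_code: str,
                  marketplaces: List[str], excluded: List[str]) -> Result:
    # Same three-stage order as SerpApi._apply_filters, with the Host
    # lists flattened to plain domain strings.
    def present(domain: str, hosts: List[str]) -> bool:
        return any(domain == h or domain.endswith(f".{h}") for h in hosts)

    if present(result.domain, marketplaces):
        return result  # stage 1: marketplace match short-circuits the rest
    if f".{country_code}" not in result.url.lower() and ".com" not in result.url.lower():
        result.filtered = True  # stage 2: country-code filter
        result.filtered_at_stage = "SerpAPI (country code filtering)"
        return result
    if present(result.domain, excluded):
        result.filtered = True  # stage 3: excluded-URLs filter
        result.filtered_at_stage = "SerpAPI (excluded URLs filtering)"
    return result


# A marketplace URL survives despite lacking ".ch" and despite also
# sitting on the excluded list:
res = apply_filters(Result("https://shop.example.de/x", "shop.example.de"),
                    country_code="ch",
                    marketplaces=["example.de"], excluded=["example.de"])
print(res.filtered)  # False
```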
```diff
@@ -235,17 +304,16 @@ class SerpApi(AsyncClient):
         # Form the SerpResult objects
         results = [
             self._create_serp_result(
-                url=url, location=location, marketplaces=marketplaces
+                url=url,
+                location=location,
+                marketplaces=marketplaces,
+                excluded_urls=excluded_urls,
             )
             for url in urls
         ]
 
-        # Filter out the excluded URLs
-        if excluded_urls:
-            excluded = [dom for excl in excluded_urls for dom in excl.domains]
-            results = [res for res in results if res.domain not in excluded]
-
+        num_non_filtered = len([res for res in results if not res.filtered])
         logger.info(
-            f'Produced {len(results)} results from SerpApi search with q="{search_string}".'
+            f'Produced {num_non_filtered} results from SerpApi search with q="{search_string}".'
         )
         return results
```
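The net behavioral change in this last hunk: excluded URLs are no longer dropped from the returned list. Every URL now comes back as a `SerpResult`, with filtered hits flagged via `filtered` / `filtered_at_stage`, and the log line counts only the survivors. A hypothetical downstream helper (not part of fraudcrawler) that restores the old drop-them semantics while keeping the flagged results for auditing:

```python
from typing import List, Tuple


def split_filtered(results: List["SerpResult"]) -> Tuple[list, list]:
    # Mirrors the new num_non_filtered count; `dropped` is what 0.3.6
    # used to discard silently.
    kept = [res for res in results if not res.filtered]
    dropped = [res for res in results if res.filtered]
    return kept, dropped
```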
```diff
--- fraudcrawler-0.3.6/pyproject.toml
+++ fraudcrawler-0.3.8/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "poetry.core.masonry.api"
 
 [tool.poetry]
 name = "fraudcrawler"
-version = "0.3.6"
+version = "0.3.8"
 description = "Intelligent Market Monitoring"
 authors = [
     "Domingo Bertus <hello@veanu.ch>",
```