fraudcrawler 0.3.9__tar.gz → 0.4.0__tar.gz

This diff shows the changes between publicly released versions of the package as they appear in their public registry; it is provided for informational purposes only.
Files changed (19)
  1. {fraudcrawler-0.3.9 → fraudcrawler-0.4.0}/PKG-INFO +1 -1
  2. {fraudcrawler-0.3.9 → fraudcrawler-0.4.0}/fraudcrawler/__init__.py +2 -1
  3. {fraudcrawler-0.3.9 → fraudcrawler-0.4.0}/fraudcrawler/base/base.py +9 -2
  4. {fraudcrawler-0.3.9 → fraudcrawler-0.4.0}/fraudcrawler/base/client.py +12 -0
  5. {fraudcrawler-0.3.9 → fraudcrawler-0.4.0}/fraudcrawler/base/orchestrator.py +9 -1
  6. {fraudcrawler-0.3.9 → fraudcrawler-0.4.0}/fraudcrawler/scraping/serp.py +196 -31
  7. {fraudcrawler-0.3.9 → fraudcrawler-0.4.0}/fraudcrawler/settings.py +3 -2
  8. {fraudcrawler-0.3.9 → fraudcrawler-0.4.0}/pyproject.toml +1 -1
  9. {fraudcrawler-0.3.9 → fraudcrawler-0.4.0}/LICENSE +0 -0
  10. {fraudcrawler-0.3.9 → fraudcrawler-0.4.0}/README.md +0 -0
  11. {fraudcrawler-0.3.9 → fraudcrawler-0.4.0}/fraudcrawler/base/__init__.py +0 -0
  12. {fraudcrawler-0.3.9 → fraudcrawler-0.4.0}/fraudcrawler/base/google-languages.json +0 -0
  13. {fraudcrawler-0.3.9 → fraudcrawler-0.4.0}/fraudcrawler/base/google-locations.json +0 -0
  14. {fraudcrawler-0.3.9 → fraudcrawler-0.4.0}/fraudcrawler/launch_demo_pipeline.py +0 -0
  15. {fraudcrawler-0.3.9 → fraudcrawler-0.4.0}/fraudcrawler/processing/__init__.py +0 -0
  16. {fraudcrawler-0.3.9 → fraudcrawler-0.4.0}/fraudcrawler/processing/processor.py +0 -0
  17. {fraudcrawler-0.3.9 → fraudcrawler-0.4.0}/fraudcrawler/scraping/__init__.py +0 -0
  18. {fraudcrawler-0.3.9 → fraudcrawler-0.4.0}/fraudcrawler/scraping/enrich.py +0 -0
  19. {fraudcrawler-0.3.9 → fraudcrawler-0.4.0}/fraudcrawler/scraping/zyte.py +0 -0

{fraudcrawler-0.3.9 → fraudcrawler-0.4.0}/PKG-INFO
@@ -1,6 +1,6 @@
  Metadata-Version: 2.3
  Name: fraudcrawler
- Version: 0.3.9
+ Version: 0.4.0
  Summary: Intelligent Market Monitoring
  Home-page: https://github.com/open-veanu/fraudcrawler
  License: MIT

{fraudcrawler-0.3.9 → fraudcrawler-0.4.0}/fraudcrawler/__init__.py
@@ -1,4 +1,4 @@
- from fraudcrawler.scraping.serp import SerpApi
+ from fraudcrawler.scraping.serp import SerpApi, SearchEngine
  from fraudcrawler.scraping.enrich import Enricher
  from fraudcrawler.scraping.zyte import ZyteApi
  from fraudcrawler.processing.processor import Processor
@@ -15,6 +15,7 @@ from fraudcrawler.base.base import (

  __all__ = [
      "SerpApi",
+     "SearchEngine",
      "Enricher",
      "ZyteApi",
      "Processor",

{fraudcrawler-0.3.9 → fraudcrawler-0.4.0}/fraudcrawler/base/base.py
@@ -6,6 +6,7 @@ from pydantic import (
      model_validator,
  )
  from pydantic_settings import BaseSettings
+ import re
  from typing import List

  import aiohttp
@@ -48,11 +49,17 @@ class Host(BaseModel):
      name: str
      domains: str | List[str]

+     @staticmethod
+     def _normalize_domain(domain: str) -> str:
+         """Make it lowercase and strip 'www.' and 'https?://' prefixes from the domain."""
+         domain = domain.strip().lower()
+         return re.sub(r"^(https?://)?(www\.)?", "", domain)
+
      @field_validator("domains", mode="before")
-     def split_domains_if_str(cls, val):
+     def normalize_domains(cls, val):
          if isinstance(val, str):
              val = val.split(",")
-         return [dom.strip().lower() for dom in val]
+         return [cls._normalize_domain(dom.strip()) for dom in val]


  class Location(BaseModel):
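
A quick sketch of what the updated validator now produces; the host name and domain strings below are purely illustrative:

    host = Host(name="Example Shop", domains="https://www.example.ch, WWW.Example-Shop.com")
    host.domains  # ['example.ch', 'example-shop.com']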

{fraudcrawler-0.3.9 → fraudcrawler-0.4.0}/fraudcrawler/base/client.py
@@ -11,6 +11,7 @@ import pandas as pd
  from fraudcrawler.settings import ROOT_DIR
  from fraudcrawler.base.base import Setup, Language, Location, Deepness, Host, Prompt
  from fraudcrawler.base.orchestrator import Orchestrator, ProductItem
+ from fraudcrawler.scraping.serp import SearchEngine

  logger = logging.getLogger(__name__)

@@ -84,6 +85,7 @@ class FraudCrawlerClient(Orchestrator):
          prompts: List[Prompt],
          marketplaces: List[Host] | None = None,
          excluded_urls: List[Host] | None = None,
+         search_engines: List[SearchEngine | str] | None = None,
      ) -> None:
          """Runs the pipeline steps: serp, enrich, zyte, process, and collect the results.

@@ -96,6 +98,7 @@ class FraudCrawlerClient(Orchestrator):
              marketplaces: The marketplaces to include in the search.
              excluded_urls: The URLs to exclude from the search.
          """
+         # Handle results files
          timestamp = datetime.today().strftime("%Y%m%d%H%M%S")
          filename = self._results_dir / self._filename_template.format(
              search_term=search_term,
@@ -105,9 +108,18 @@ class FraudCrawlerClient(Orchestrator):
          )
          self._results.append(Results(search_term=search_term, filename=filename))

+         # Normalize inputs
+         nrm_se: List[SearchEngine] = list(SearchEngine)
+         if search_engines:
+             nrm_se = [
+                 SearchEngine(se) if isinstance(se, str) else se for se in search_engines
+             ]
+
+         # Run the pipeline by calling the orchestrator's run method
          asyncio.run(
              super().run(
                  search_term=search_term,
+                 search_engines=nrm_se,
                  language=language,
                  location=location,
                  deepness=deepness,

{fraudcrawler-0.3.9 → fraudcrawler-0.4.0}/fraudcrawler/base/orchestrator.py
@@ -16,7 +16,7 @@ from fraudcrawler.settings import (
      DEFAULT_N_PROC_WKRS,
  )
  from fraudcrawler.base.base import Deepness, Host, Language, Location, Prompt
- from fraudcrawler import SerpApi, Enricher, ZyteApi, Processor
+ from fraudcrawler import SerpApi, SearchEngine, Enricher, ZyteApi, Processor

  logger = logging.getLogger(__name__)

@@ -387,6 +387,7 @@ class Orchestrator(ABC):
          queue: asyncio.Queue[dict | None],
          search_term: str,
          search_term_type: str,
+         search_engines: List[SearchEngine],
          language: Language,
          location: Location,
          num_results: int,
@@ -397,6 +398,7 @@ class Orchestrator(ABC):
          item = {
              "search_term": search_term,
              "search_term_type": search_term_type,
+             "search_engines": search_engines,
              "language": language,
              "location": location,
              "num_results": num_results,
@@ -410,6 +412,7 @@ class Orchestrator(ABC):
          self,
          queue: asyncio.Queue[dict | None],
          search_term: str,
+         search_engines: List[SearchEngine],
          language: Language,
          location: Location,
          deepness: Deepness,
@@ -429,6 +432,7 @@ class Orchestrator(ABC):
          await self._add_serp_items_for_search_term(
              search_term=search_term,
              search_term_type="initial",
+             search_engines=search_engines,
              num_results=deepness.num_results,
              **common_kwargs,  # type: ignore[arg-type]
          )
@@ -450,6 +454,7 @@ class Orchestrator(ABC):
              await self._add_serp_items_for_search_term(
                  search_term=trm,
                  search_term_type="enriched",
+                 search_engines=search_engines,
                  num_results=enrichment.additional_urls_per_term,
                  **common_kwargs,  # type: ignore[arg-type]
              )
@@ -457,6 +462,7 @@ class Orchestrator(ABC):
      async def run(
          self,
          search_term: str,
+         search_engines: List[SearchEngine],
          language: Language,
          location: Location,
          deepness: Deepness,
@@ -469,6 +475,7 @@ class Orchestrator(ABC):

          Args:
              search_term: The search term for the query.
+             search_engines: The list of search engines to use for the SerpAPI query.
              language: The language to use for the query.
              location: The location to use for the query.
              deepness: The search depth and enrichment details.
@@ -523,6 +530,7 @@ class Orchestrator(ABC):
          await self._add_serp_items(
              queue=serp_queue,
              search_term=search_term,
+             search_engines=search_engines,
              language=language,
              location=location,
              deepness=deepness,

{fraudcrawler-0.3.9 → fraudcrawler-0.4.0}/fraudcrawler/scraping/serp.py
@@ -1,4 +1,5 @@
  import asyncio
+ from enum import Enum
  import logging
  from pydantic import BaseModel
  from typing import List
@@ -13,7 +14,6 @@ logger = logging.getLogger(__name__)

  class SerpResult(BaseModel):
      """Model for a single search result from SerpApi."""
-
      url: str
      domain: str
      marketplace_name: str
@@ -21,12 +21,20 @@ class SerpResult(BaseModel):
      filtered_at_stage: str | None = None


+ class SearchEngine(Enum):
+     """Enum for the supported search engines."""
+     GOOGLE = "google"
+     GOOGLE_SHOPPING = "google_shopping"
+
+
  class SerpApi(AsyncClient):
      """A client to interact with the SerpApi for performing searches."""

      _endpoint = "https://serpapi.com/search"
-     _engine = "google"
-     _default_marketplace_name = "Google"
+     _engine_marketplace_names = {
+         SearchEngine.GOOGLE.value: "Google",
+         SearchEngine.GOOGLE_SHOPPING.value: "Google Shopping"
+     }
      _hostname_pattern = r"^(?:https?:\/\/)?([^\/:?#]+)"

      def __init__(
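
SearchEngine is a plain Enum keyed by the SerpApi engine strings, and the per-engine mapping replaces the single _default_marketplace_name. An illustrative lookup (class attributes accessed directly here only to show the mapping):

    from fraudcrawler import SearchEngine, SerpApi

    assert SearchEngine("google_shopping") is SearchEngine.GOOGLE_SHOPPING
    SerpApi._engine_marketplace_names[SearchEngine.GOOGLE.value]           # 'Google'
    SerpApi._engine_marketplace_names[SearchEngine.GOOGLE_SHOPPING.value]  # 'Google Shopping'
    # SearchEngine("bing")  # would raise ValueError: 'bing' is not a valid SearchEngine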
@@ -73,8 +81,42 @@
              hostname = hostname[4:]
          return hostname.lower()

+     @staticmethod
+     def _extract_search_results(response: dict, engine: str) -> List[str]:
+         """Extracts search results from the response based on the engine type.
+
+         Args:
+             response: The response from the SerpApi search.
+             engine: The search engine used.
+
+         Returns:
+             A list of URLs extracted from the response.
+         """
+         urls = []
+         if engine == SearchEngine.GOOGLE.value:
+             # Get the organic_results
+             results = response.get("organic_results")
+             if results is None:
+                 logger.warning(f'No SerpAPI results for engine="{engine}".')
+             else:
+                 urls = [url for res in results if (url := res.get("link"))]
+
+         elif engine == SearchEngine.GOOGLE_SHOPPING.value:
+             # Get the shopping_results
+             results = response.get("shopping_results")
+             if results is None:
+                 logger.warning(f'No SerpAPI results for engine="{engine}".')
+             else:
+                 urls = [url for res in results if (url := res.get("product_link"))]
+
+         else:
+             raise ValueError(f"Invalid SerpAPI search engine: {engine}")
+
+         return urls
+
      async def _search(
          self,
+         engine: str,
          search_string: str,
          language: Language,
          location: Location,
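
A minimal sketch of the new extraction helper against hand-built response dicts; real SerpApi payloads carry many more fields, only the keys read here are shown:

    organic = {"organic_results": [{"link": "https://example.ch/p/1"}, {"title": "no link here"}]}
    shopping = {"shopping_results": [{"product_link": "https://shop.example.ch/p/2"}]}

    SerpApi._extract_search_results(response=organic, engine="google")
    # -> ['https://example.ch/p/1']   (entries without a 'link' are skipped)
    SerpApi._extract_search_results(response=shopping, engine="google_shopping")
    # -> ['https://shop.example.ch/p/2']
    SerpApi._extract_search_results(response={}, engine="google")
    # -> [] and logs: No SerpAPI results for engine="google".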
@@ -83,6 +125,7 @@
          """Performs a search using SerpApi and returns the URLs of the results.

          Args:
+             engine: The search engine to use.
              search_string: The search string (with potentially added site: parameters).
              language: The language to use for the query ('hl' parameter).
              location: The location to use for the query ('gl' parameter).
@@ -93,20 +136,35 @@
              q: The search string (with potentially added site: parameters).
              google_domain: The Google domain to use for the search (e.g. google.[com]).
              location_[requested|used]: The location to use for the search.
-             tbs: The time-based search parameters (e.g. 'ctr:CH&cr:countryCH').
+             tbs: The to-be-searched parameters (e.g. 'ctr:CH').
+             cr: The country code to limit the search to (e.g. 'countryCH').
              gl: The country code to use for the search.
              hl: The language code to use for the search.
              num: The number of results to return.
              api_key: The API key to use for the search.
          """
+         if engine not in self._engine_marketplace_names:
+             raise ValueError(
+                 f"Invalid SerpAPI search engine: {engine}. "
+                 f"Supported engines are: {list(self._engine_marketplace_names.keys())}."
+             )
+         logger.debug(
+             f'Performing SerpAPI search with engine="{engine}", '
+             f'q="{search_string}", '
+             f'location="{location.name}", '
+             f'language="{language.code}", '
+             f"num_results={num_results}."
+         )
+
          # Setup the parameters
          params = {
-             "engine": self._engine,
+             "engine": engine,
              "q": search_string,
              "google_domain": f"google.{location.code}",
              "location_requested": location.name,
              "location_used": location.name,
-             "tbs": f"ctr:{location.code.upper()}&cr:country{location.code.upper()}",
+             "tbs": f"ctr:{location.code.upper()}",
+             "cr": f"country{location.code.upper()}",
              "gl": location.code,
              "hl": language.code,
              "num": num_results,
@@ -132,18 +190,11 @@
          if err is not None:
              raise err

-         # Get the organic_results
-         results = response.get("organic_results")
-         if results is None:
-             logger.warning(
-                 f'No organic_results key in SerpAPI results for search_string="{search_string}".'
-             )
-             return []
+         # Extract the URLs from the response
+         urls = self._extract_search_results(response=response, engine=engine)

-         # Extract urls
-         urls = [res.get("link") for res in results]
          logger.debug(
-             f'Found {len(urls)} URLs from SerpApi search for q="{search_string}".'
+             f'Found total of {len(urls)} URLs from SerpApi search for q="{search_string}" and engine="{engine}".'
          )
          return urls

@@ -234,6 +285,7 @@

      def _create_serp_result(
          self,
+         engine: str,
          url: str,
          location: Location,
          marketplaces: List[Host] | None = None,
@@ -244,13 +296,18 @@
          If marketplaces is None or the domain can not be extracted, the default marketplace name is used.

          Args:
+             engine: The search engine used.
              url: The URL to be processed.
              location: The location to use for the query.
              marketplaces: The list of marketplaces to compare the URL against.
+             excluded_urls: The list of excluded URLs.
          """
          # Get marketplace name
          domain = self._get_domain(url=url)
-         marketplace_name = self._default_marketplace_name
+
+         # Select marketplace name based on engine
+         marketplace_name = self._engine_marketplace_names[engine]
+
          if marketplaces:
              try:
                  marketplace_name = next(
@@ -277,9 +334,109 @@
          )
          return result

+     async def _search_google(
+         self,
+         search_string: str,
+         language: Language,
+         location: Location,
+         num_results: int,
+         marketplaces: List[Host] | None = None,
+         excluded_urls: List[Host] | None = None,
+     ) -> List[SerpResult]:
+         """Performs a google search using SerpApi and returns SerpResults.
+
+         Args:
+             search_string: The search string (with potentially added site: parameters).
+             language: The language to use for the query ('hl' parameter).
+             location: The location to use for the query ('gl' parameter).
+             num_results: Max number of results to return.
+             marketplaces: The marketplaces to include in the search.
+             excluded_urls: The URLs to exclude from the search.
+         """
+         engine = SearchEngine.GOOGLE.value
+
+         # Perform the search
+         urls = await self._search(
+             engine=engine,
+             search_string=search_string,
+             language=language,
+             location=location,
+             num_results=num_results,
+         )
+
+         # Create SerpResult objects from the URLs
+         results = [
+             self._create_serp_result(
+                 url=url,
+                 location=location,
+                 marketplaces=marketplaces,
+                 excluded_urls=excluded_urls,
+                 engine=engine,
+             )
+             for url in urls
+         ]
+
+         logger.debug(
+             f'Produced {len(results)} results from google search with q="{search_string}".'
+         )
+         return results
+
+     async def _search_google_shopping(
+         self,
+         search_string: str,
+         language: Language,
+         location: Location,
+         num_results: int,
+         marketplaces: List[Host] | None = None,
+         excluded_urls: List[Host] | None = None,
+     ) -> List[SerpResult]:
+         """Performs a google search using SerpApi and returns SerpResults.
+
+         Args:
+             search_string: The search string (with potentially added site: parameters).
+             language: The language to use for the query ('hl' parameter).
+             location: The location to use for the query ('gl' parameter).
+             num_results: Max number of results to return.
+             marketplaces: The marketplaces to include in the search.
+             excluded_urls: The URLs to exclude from the search.
+         """
+         engine = SearchEngine.GOOGLE_SHOPPING.value
+
+         # Perform the search
+         urls = await self._search(
+             engine=engine,
+             search_string=search_string,
+             language=language,
+             location=location,
+             num_results=num_results,
+         )
+
+         # !!! NOTE !!!: Google Shopping results do not properly support the 'num' parameter,
+         # so we might get more results than requested. This is a known issue with SerpAPI
+         # and Google Shopping searches (see https://github.com/serpapi/public-roadmap/issues/1858)
+         urls = urls[:num_results]
+
+         # Create SerpResult objects from the URLs
+         results = [
+             self._create_serp_result(
+                 url=url,
+                 location=location,
+                 marketplaces=marketplaces,
+                 excluded_urls=excluded_urls,
+                 engine=engine,
+             )
+             for url in urls
+         ]
+
+         logger.debug(
+             f'Produced {len(results)} results from google shopping search with q="{search_string}".'
+         )
+         return results
+
      async def apply(
          self,
          search_term: str,
+         search_engines: List[SearchEngine],
          language: Language,
          location: Location,
          num_results: int,
@@ -305,27 +462,35 @@
              sites = [dom for host in marketplaces for dom in host.domains]
              search_string += " site:" + " OR site:".join(s for s in sites)

-         # Perform the search
-         urls = await self._search(
-             search_string=search_string,
-             language=language,
-             location=location,
-             num_results=num_results,
-         )
+         # Initialize the results list
+         results: List[SerpResult] = []

-         # Form the SerpResult objects
-         results = [
-             self._create_serp_result(
-                 url=url,
+         # Perform the google search
+         if SearchEngine.GOOGLE in search_engines:
+             ggl_res = await self._search_google(
+                 search_string=search_string,
+                 language=language,
                  location=location,
+                 num_results=num_results,
                  marketplaces=marketplaces,
                  excluded_urls=excluded_urls,
              )
-             for url in urls
-         ]
+             results.extend(ggl_res)
+
+         # Perform the google shopping search
+         if SearchEngine.GOOGLE_SHOPPING in search_engines:
+             shp_res = await self._search_google_shopping(
+                 search_string=search_string,
+                 language=language,
+                 location=location,
+                 num_results=num_results,
+                 marketplaces=marketplaces,
+                 excluded_urls=excluded_urls,
+             )
+             results.extend(shp_res)

          num_non_filtered = len([res for res in results if not res.filtered])
          logger.info(
-             f'Produced {num_non_filtered} results from SerpApi search with q="{search_string}".'
+             f'Produced a total of {num_non_filtered} results from SerpApi search with q="{search_string}".'
          )
          return results
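
A sketch of how a caller can now restrict which engines are queried; serp_api, language, location, marketplaces, and excluded_urls stand for already-constructed instances and are not shown here:

    # inside an async function: query only Google Shopping for this search term
    results = await serp_api.apply(
        search_term="example product",
        search_engines=[SearchEngine.GOOGLE_SHOPPING],
        language=language,
        location=location,
        num_results=10,
        marketplaces=marketplaces,
        excluded_urls=excluded_urls,
    )
    # Passing [SearchEngine.GOOGLE, SearchEngine.GOOGLE_SHOPPING] runs both searches
    # and concatenates their SerpResult lists.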

{fraudcrawler-0.3.9 → fraudcrawler-0.4.0}/fraudcrawler/settings.py
@@ -1,4 +1,5 @@
  from pathlib import Path
+ from typing import List

  # Generic settings
  MAX_RETRIES = 3
@@ -8,8 +9,8 @@ ROOT_DIR = Path(__file__).parents[1]
  # Serp settings
  GOOGLE_LOCATIONS_FILENAME = ROOT_DIR / "fraudcrawler" / "base" / "google-locations.json"
  GOOGLE_LANGUAGES_FILENAME = ROOT_DIR / "fraudcrawler" / "base" / "google-languages.json"
- SERP_DEFAULT_COUNTRY_CODES = [
-     ".com",
+ SERP_DEFAULT_COUNTRY_CODES: List[str] = [
+     # ".com",
  ]

  # Enrichment settings

{fraudcrawler-0.3.9 → fraudcrawler-0.4.0}/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "poetry.core.masonry.api"

  [tool.poetry]
  name = "fraudcrawler"
- version = "0.3.9"
+ version = "0.4.0"
  description = "Intelligent Market Monitoring"
  authors = [
      "Domingo Bertus <hello@veanu.ch>",