fraudcrawler 0.3.10-py3-none-any.whl → 0.4.2-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

fraudcrawler/__init__.py CHANGED
@@ -1,8 +1,8 @@
- from fraudcrawler.scraping.serp import SerpApi
+ from fraudcrawler.scraping.serp import SerpApi, SearchEngine
  from fraudcrawler.scraping.enrich import Enricher
  from fraudcrawler.scraping.zyte import ZyteApi
  from fraudcrawler.processing.processor import Processor
- from fraudcrawler.base.orchestrator import Orchestrator, ProductItem
+ from fraudcrawler.base.orchestrator import Orchestrator
  from fraudcrawler.base.client import FraudCrawlerClient
  from fraudcrawler.base.base import (
  Deepness,
@@ -11,10 +11,12 @@ from fraudcrawler.base.base import (
  Language,
  Location,
  Prompt,
+ ProductItem,
  )

  __all__ = [
  "SerpApi",
+ "SearchEngine",
  "Enricher",
  "ZyteApi",
  "Processor",
fraudcrawler/base/base.py CHANGED
@@ -2,12 +2,13 @@ import json
  import logging
  from pydantic import (
  BaseModel,
+ Field,
  field_validator,
  model_validator,
  )
  from pydantic_settings import BaseSettings
  import re
- from typing import List
+ from typing import List, Dict

  import aiohttp

@@ -114,12 +115,39 @@ class Deepness(BaseModel):
  enrichment: Enrichment | None = None


+ class ProductItem(BaseModel):
+ """Model representing a product item."""
+
+ # Serp/Enrich parameters
+ search_term: str
+ search_term_type: str
+ url: str
+ marketplace_name: str
+ domain: str
+
+ # Zyte parameters
+ product_name: str | None = None
+ product_price: str | None = None
+ product_description: str | None = None
+ product_images: List[str] | None = None
+ probability: float | None = None
+ html: str | None = None
+ html_clean: str | None = None
+
+ # Processor parameters are set dynamic so we must allow extra fields
+ classifications: Dict[str, int] = Field(default_factory=dict)
+
+ # Filtering parameters
+ filtered: bool = False
+ filtered_at_stage: str | None = None
+
+
  class Prompt(BaseModel):
  """Model for prompts."""

  name: str
- context: str
  system_prompt: str
+ product_item_fields: List[str]
  allowed_classes: List[int]

  @field_validator("allowed_classes", mode="before")
@@ -129,6 +157,17 @@ class Prompt(BaseModel):
  raise ValueError("all values in allowed_classes must be positive integers.")
  return val

+ @field_validator("product_item_fields", mode="before")
+ def validate_product_item_fields(cls, val):
+ """Ensure all product_item_fields are valid ProductItem attributes."""
+ valid_fields = set(ProductItem.model_fields.keys())
+ for field in val:
+ if field not in valid_fields:
+ raise ValueError(
+ f"Invalid product_item_field: '{field}'. Must be one of: {sorted(valid_fields)}"
+ )
+ return val
+

  class AsyncClient:
  """Base class for sub-classes using async HTTP requests."""
fraudcrawler/base/client.py CHANGED
@@ -9,8 +9,17 @@ from typing import List
  import pandas as pd

  from fraudcrawler.settings import ROOT_DIR
- from fraudcrawler.base.base import Setup, Language, Location, Deepness, Host, Prompt
- from fraudcrawler.base.orchestrator import Orchestrator, ProductItem
+ from fraudcrawler.base.base import (
+ Setup,
+ Language,
+ Location,
+ Deepness,
+ Host,
+ Prompt,
+ ProductItem,
+ )
+ from fraudcrawler.base.orchestrator import Orchestrator
+ from fraudcrawler.scraping.serp import SearchEngine

  logger = logging.getLogger(__name__)

@@ -84,6 +93,7 @@ class FraudCrawlerClient(Orchestrator):
  prompts: List[Prompt],
  marketplaces: List[Host] | None = None,
  excluded_urls: List[Host] | None = None,
+ search_engines: List[SearchEngine | str] | None = None,
  ) -> None:
  """Runs the pipeline steps: serp, enrich, zyte, process, and collect the results.

@@ -96,6 +106,7 @@ class FraudCrawlerClient(Orchestrator):
  marketplaces: The marketplaces to include in the search.
  excluded_urls: The URLs to exclude from the search.
  """
+ # Handle results files
  timestamp = datetime.today().strftime("%Y%m%d%H%M%S")
  filename = self._results_dir / self._filename_template.format(
  search_term=search_term,
@@ -105,9 +116,18 @@ class FraudCrawlerClient(Orchestrator):
  )
  self._results.append(Results(search_term=search_term, filename=filename))

+ # Normalize inputs
+ nrm_se: List[SearchEngine] = list(SearchEngine)
+ if search_engines:
+ nrm_se = [
+ SearchEngine(se) if isinstance(se, str) else se for se in search_engines
+ ]
+
+ # Run the pipeline by calling the orchestrator's run method
  asyncio.run(
  super().run(
  search_term=search_term,
+ search_engines=nrm_se,
  language=language,
  location=location,
  deepness=deepness,
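
The new `search_engines` argument of `FraudCrawlerClient.run` accepts enum members or their string values and defaults to every engine when omitted. A standalone sketch of that normalization step; it mirrors the lines above, and the helper function name is made up purely for illustration:

from typing import List
from fraudcrawler import SearchEngine

def normalize_engines(search_engines=None) -> List[SearchEngine]:
    # Default: search all supported engines.
    engines: List[SearchEngine] = list(SearchEngine)
    if search_engines:
        engines = [SearchEngine(se) if isinstance(se, str) else se for se in search_engines]
    return engines

print(normalize_engines())                     # [<SearchEngine.GOOGLE: 'google'>, <SearchEngine.GOOGLE_SHOPPING: 'google_shopping'>]
print(normalize_engines(["google_shopping"]))  # [<SearchEngine.GOOGLE_SHOPPING: 'google_shopping'>]
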
fraudcrawler/base/orchestrator.py CHANGED
@@ -1,12 +1,13 @@
  from abc import ABC, abstractmethod
  import asyncio
  import logging
- from pydantic import BaseModel, Field
  from typing import Dict, List, Set, cast
+ from bs4 import BeautifulSoup

  from fraudcrawler.settings import (
  PROCESSOR_DEFAULT_MODEL,
  PROCESSOR_DEFAULT_IF_MISSING,
+ PROCESSOR_PRODUCT_DETAILS_TEMPLATE,
  MAX_RETRIES,
  RETRY_DELAY,
  )
@@ -15,37 +16,19 @@ from fraudcrawler.settings import (
  DEFAULT_N_ZYTE_WKRS,
  DEFAULT_N_PROC_WKRS,
  )
- from fraudcrawler.base.base import Deepness, Host, Language, Location, Prompt
- from fraudcrawler import SerpApi, Enricher, ZyteApi, Processor
+ from fraudcrawler.base.base import (
+ Deepness,
+ Host,
+ Language,
+ Location,
+ Prompt,
+ ProductItem,
+ )
+ from fraudcrawler import SerpApi, SearchEngine, Enricher, ZyteApi, Processor

  logger = logging.getLogger(__name__)


- class ProductItem(BaseModel):
- """Model representing a product item."""
-
- # Serp/Enrich parameters
- search_term: str
- search_term_type: str
- url: str
- marketplace_name: str
- domain: str
-
- # Zyte parameters
- product_name: str | None = None
- product_price: str | None = None
- product_description: str | None = None
- product_images: List[str] | None = None
- probability: float | None = None
-
- # Processor parameters are set dynamic so we must allow extra fields
- classifications: Dict[str, int] = Field(default_factory=dict)
-
- # Filtering parameters
- filtered: bool = False
- filtered_at_stage: str | None = None
-
-
  class Orchestrator(ABC):
  """Abstract base class for orchestrating the different actors (crawling, processing).

@@ -231,15 +214,16 @@ class Orchestrator(ABC):
  product.probability = self._zyteapi.extract_probability(
  details=details
  )
-
+ product.html = self._zyteapi.extract_html(details=details)
+ if product.html:
+ soup = BeautifulSoup(product.html, "html.parser")
+ product.html_clean = soup.get_text(separator=" ", strip=True)
  # Filter the product based on the probability threshold
  if not self._zyteapi.keep_product(details=details):
  product.filtered = True
  product.filtered_at_stage = "Zyte probability threshold"
-
  except Exception as e:
  logger.warning(f"Error executing Zyte API search: {e}.")
-
  await queue_out.put(product)
  queue_in.task_done()

@@ -269,19 +253,26 @@ class Orchestrator(ABC):
  if not product.filtered:
  try:
  url = product.url
- name = product.product_name
- description = product.product_description
-
  # Run all the configured prompts
  for prompt in prompts:
+ # Dynamically build product_details string
+ details = []
+ for field in prompt.product_item_fields:
+ value = getattr(product, field, None)
+ if value is not None:
+ details.append(
+ PROCESSOR_PRODUCT_DETAILS_TEMPLATE.format(
+ field_name=field, field_value=value
+ )
+ )
+ product_details = "\n\n".join(details)
  logger.debug(
- f"Classify product {name} with prompt {prompt.name}"
+ f"Classify product at {url} with prompt {prompt.name} and details: {product_details}"
  )
  classification = await self._processor.classify(
  prompt=prompt,
  url=url,
- name=name,
- description=description,
+ product_details=product_details,
  )
  product.classifications[prompt.name] = classification
  except Exception as e:
@@ -387,6 +378,7 @@ class Orchestrator(ABC):
  queue: asyncio.Queue[dict | None],
  search_term: str,
  search_term_type: str,
+ search_engines: List[SearchEngine],
  language: Language,
  location: Location,
  num_results: int,
@@ -397,6 +389,7 @@ class Orchestrator(ABC):
  item = {
  "search_term": search_term,
  "search_term_type": search_term_type,
+ "search_engines": search_engines,
  "language": language,
  "location": location,
  "num_results": num_results,
@@ -410,6 +403,7 @@ class Orchestrator(ABC):
  self,
  queue: asyncio.Queue[dict | None],
  search_term: str,
+ search_engines: List[SearchEngine],
  language: Language,
  location: Location,
  deepness: Deepness,
@@ -429,6 +423,7 @@ class Orchestrator(ABC):
  await self._add_serp_items_for_search_term(
  search_term=search_term,
  search_term_type="initial",
+ search_engines=search_engines,
  num_results=deepness.num_results,
  **common_kwargs,  # type: ignore[arg-type]
  )
@@ -450,6 +445,7 @@ class Orchestrator(ABC):
  await self._add_serp_items_for_search_term(
  search_term=trm,
  search_term_type="enriched",
+ search_engines=search_engines,
  num_results=enrichment.additional_urls_per_term,
  **common_kwargs,  # type: ignore[arg-type]
  )
@@ -457,6 +453,7 @@ class Orchestrator(ABC):
  async def run(
  self,
  search_term: str,
+ search_engines: List[SearchEngine],
  language: Language,
  location: Location,
  deepness: Deepness,
@@ -469,6 +466,7 @@ class Orchestrator(ABC):

  Args:
  search_term: The search term for the query.
+ search_engines: The list of search engines to use for the SerpAPI query.
  language: The language to use for the query.
  location: The location to use for the query.
  deepness: The search depth and enrichment details.
@@ -523,6 +521,7 @@ class Orchestrator(ABC):
  await self._add_serp_items(
  queue=serp_queue,
  search_term=search_term,
+ search_engines=search_engines,
  language=language,
  location=location,
  deepness=deepness,
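
The processing-worker hunk above now assembles a per-prompt `product_details` string from the fields listed in `prompt.product_item_fields`, using the two templates added to settings.py (see below). A self-contained sketch of that formatting with a hypothetical product, using a plain dict instead of a ProductItem for brevity:

PROCESSOR_PRODUCT_DETAILS_TEMPLATE = "{field_name}:\n{field_value}"
PROCESSOR_USER_PROMPT_TEMPLATE = "Product Details:\n{product_details}\n\nRelevance:"

product = {
    "product_name": "Fridge XY-200",
    "html_clean": "Fridge XY-200 energy class C in stock delivery in 3 days",
    "product_price": None,  # None values are skipped, as in the worker loop above
}
fields = ["product_name", "html_clean", "product_price"]

details = [
    PROCESSOR_PRODUCT_DETAILS_TEMPLATE.format(field_name=f, field_value=v)
    for f in fields
    if (v := product.get(f)) is not None
]
product_details = "\n\n".join(details)
user_prompt = PROCESSOR_USER_PROMPT_TEMPLATE.format(product_details=product_details)
print(user_prompt)
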
fraudcrawler/launch_demo_pipeline.py CHANGED
@@ -16,38 +16,39 @@ def main():
  search_term = "Kühlschrank"
  language = Language(name="German")
  location = Location(name="Switzerland")
- deepness = Deepness(num_results=20)
+ deepness = Deepness(num_results=10)
  prompts = [
  Prompt(
- name="relevance",
- context="This organization is interested in checking the energy efficiency of certain devices.",
+ name="availability",
  system_prompt=(
- "You are a helpful and intelligent assistant. Your task is to classify any given product "
- "as either relevant (1) or not relevant (0), strictly based on the context and product details provided by the user. "
+ "You are a helpful and intelligent assistant helping an organization that is interested in checking the availability of certain products."
+ "Your task is to classify any given product as either available (1) or not available (0), strictly based on the context and product details provided by the user. "
  "You must consider all aspects of the given context and make a binary decision accordingly. "
- "If the product aligns with the user's needs, classify it as 1 (relevant); otherwise, classify it as 0 (not relevant). "
+ "If the product can be purchased, added to a shopping basket, delivered, or is listed as available in any form, classify it as 1 (available); "
+ "if there is any mention of out of stock, not available, no longer shippable, or similar, classify it as 0 (not available). "
  "Respond only with the number 1 or 0."
  ),
+ product_item_fields=["product_name", "html_clean"],
  allowed_classes=[0, 1],
  ),
- Prompt(
- name="seriousness",
- context="This organization is interested in checking the energy efficiency of certain devices.",
- system_prompt=(
- "You are an intelligent and discerning assistant. Your task is to classify each item as either "
- "a product for sale (1) or not a product for sale (0). To make this distinction, consider the following criteria: \n"
- " 1 Product for Sale (1): Classify as 1 if the result clearly indicates an item available for purchase, typically found "
- "within an online shop or marketplace.\n"
- " 2 Not a Product for Sale (0): Classify as 0 if the result is unrelated to a direct purchase of a product. This includes items such as: \n"
- " - Books and Videos: These may be available for sale, but if they are about or related to the searched product rather than being the "
- "exact product itself, classify as 0.\n"
- " - Advertisements: Promotional content that doesn't directly sell a product.\n"
- " - Companies and Services: Names and descriptions of companies or services related to the product but not the product itself.\n"
- " - Related Topics/Content: Any text or media that discusses or elaborates on the topic without offering a tangible product for sale.\n"
- "Make your decision based solely on the context and details provided in the search result. Respond only with the number 1 or 0."
- ),
- allowed_classes=[0, 1],
- ),
+ # Prompt(
+ # name="seriousness",
+ # system_prompt=(
+ # "You are a helpful and intelligent assistant helping an organization that is interested in checking the energy efficiency of certain devices. "
+ # "Your task is to classify each item as either a product for sale (1) or not a product for sale (0). To make this distinction, consider the following criteria: \n"
+ # " 1 Product for Sale (1): Classify as 1 if the result clearly indicates an item available for purchase, typically found "
+ # "within an online shop or marketplace.\n"
+ # " 2 Not a Product for Sale (0): Classify as 0 if the result is unrelated to a direct purchase of a product. This includes items such as: \n"
+ # " - Books and Videos: These may be available for sale, but if they are about or related to the searched product rather than being the "
+ # "exact product itself, classify as 0.\n"
+ # " - Advertisements: Promotional content that doesn't directly sell a product.\n"
+ # " - Companies and Services: Names and descriptions of companies or services related to the product but not the product itself.\n"
+ # " - Related Topics/Content: Any text or media that discusses or elaborates on the topic without offering a tangible product for sale.\n"
+ # "Make your decision based solely on the context and details provided in the search result. Respond only with the number 1 or 0."
+ # ),
+ # product_item_fields=["product_name", "product_description"],
+ # allowed_classes=[0, 1],
+ # ),
  ]
  # # Optional: Add tern ENRICHEMENT
  # from fraudcrawler import Enrichment
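
The active demo prompt above selects `product_name` and `html_clean`; the `product_item_fields` validator added in base.py rejects unknown names already at `Prompt` construction time. A minimal sketch of both outcomes, assuming only the fields and keyword arguments shown in this diff (the surrounding script is illustrative, not part of the package):

from fraudcrawler import Prompt

# Valid: product_name and html_clean are ProductItem fields.
prompt = Prompt(
    name="availability",
    system_prompt="Classify the product as available (1) or not available (0).",
    product_item_fields=["product_name", "html_clean"],
    allowed_classes=[0, 1],
)

# Invalid: "price_tag" is not a ProductItem attribute; the validator raises
# ValueError, which pydantic surfaces as a ValidationError.
try:
    Prompt(
        name="broken",
        system_prompt="irrelevant",
        product_item_fields=["price_tag"],
        allowed_classes=[0, 1],
    )
except ValueError as exc:
    print(exc)
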
fraudcrawler/processing/processor.py CHANGED
@@ -52,42 +52,34 @@ class Processor:
  raise ValueError("Empty response from OpenAI API")
  return content

- async def classify(
- self, prompt: Prompt, url: str, name: str | None, description: str | None
- ) -> int:
- """A generic classification method that classified a product based on a prompt object.
+ async def classify(self, prompt: Prompt, url: str, product_details: str) -> int:
+ """A generic classification method that classifies a product based on a prompt object.

  Args:
- prompt: A dictionary with keys "system_prompt", "user_prompt", etc.
+ prompt: A dictionary with keys "system_prompt", etc.
  url: Product URL (often used in the user_prompt).
- name: Product name (often used in the user_prompt).
- description: Product description (often used in the user_prompt).
+ product_details: String with product details, formatted per prompt.product_item_fields.

  Note:
  This method returns `PROCESSOR_DEFAULT_IF_MISSING` if:
- - 'name' or 'description' is None
+ - product_details is empty
  - an error occurs during the API call
  - if the response isn't in allowed_classes.
  """
  # If required fields are missing, return the prompt's default fallback if provided.
- if name is None or description is None:
- logger.warning(
- f"Missing required fields for classification: name='{name}', description='{description}'"
- )
+ if not product_details:
+ logger.warning("Missing required product_details for classification.")
  return self._default_if_missing

  # Substitute placeholders in user_prompt with the relevant arguments
  user_prompt = PROCESSOR_USER_PROMPT_TEMPLATE.format(
- context=prompt.context,
- url=url,
- name=name,
- description=description,
+ product_details=product_details,
  )

  # Call the OpenAI API
  try:
  logger.debug(
- f'Calling OpenAI API for classification (name="{name}", prompt="{prompt.name}")'
+ f'Calling OpenAI API for classification (url="{url}", prompt="{prompt.name}")'
  )
  content = await self._call_openai_api(
  system_prompt=prompt.system_prompt,
@@ -104,12 +96,12 @@ class Processor:
  return self._default_if_missing

  logger.info(
- f'Classification for "{name}" (prompt={prompt.name}): {classification}'
+ f'Classification for url="{url}" (prompt={prompt.name}): {classification}'
  )
  return classification

  except Exception as e:
  logger.error(
- f'Error classifying product "{name}" with prompt "{prompt.name}": {e}'
+ f'Error classifying product at url="{url}" with prompt "{prompt.name}": {e}'
  )
  return self._default_if_missing
fraudcrawler/scraping/serp.py CHANGED
@@ -1,4 +1,5 @@
  import asyncio
+ from enum import Enum
  import logging
  from pydantic import BaseModel
  from typing import List
@@ -21,12 +22,21 @@ class SerpResult(BaseModel):
  filtered_at_stage: str | None = None


+ class SearchEngine(Enum):
+ """Enum for the supported search engines."""
+
+ GOOGLE = "google"
+ GOOGLE_SHOPPING = "google_shopping"
+
+
  class SerpApi(AsyncClient):
  """A client to interact with the SerpApi for performing searches."""

  _endpoint = "https://serpapi.com/search"
- _engine = "google"
- _default_marketplace_name = "Google"
+ _engine_marketplace_names = {
+ SearchEngine.GOOGLE.value: "Google",
+ SearchEngine.GOOGLE_SHOPPING.value: "Google Shopping",
+ }
  _hostname_pattern = r"^(?:https?:\/\/)?([^\/:?#]+)"

  def __init__(
@@ -73,8 +83,42 @@ class SerpApi(AsyncClient):
  hostname = hostname[4:]
  return hostname.lower()

+ @staticmethod
+ def _extract_search_results(response: dict, engine: str) -> List[str]:
+ """Extracts search results from the response based on the engine type.
+
+ Args:
+ response: The response from the SerpApi search.
+ engine: The search engine used.
+
+ Returns:
+ A list of URLs extracted from the response.
+ """
+ urls = []
+ if engine == SearchEngine.GOOGLE.value:
+ # Get the organic_results
+ results = response.get("organic_results")
+ if results is None:
+ logger.warning(f'No SerpAPI results for engine="{engine}".')
+ else:
+ urls = [url for res in results if (url := res.get("link"))]
+
+ elif engine == SearchEngine.GOOGLE_SHOPPING.value:
+ # Get the shopping_results
+ results = response.get("shopping_results")
+ if results is None:
+ logger.warning(f'No SerpAPI results for engine="{engine}".')
+ else:
+ urls = [url for res in results if (url := res.get("product_link"))]
+
+ else:
+ raise ValueError(f"Invalid SerpAPI search engine: {engine}")
+
+ return urls
+
  async def _search(
  self,
+ engine: str,
  search_string: str,
  language: Language,
  location: Location,
@@ -83,6 +127,7 @@ class SerpApi(AsyncClient):
  """Performs a search using SerpApi and returns the URLs of the results.

  Args:
+ engine: The search engine to use.
  search_string: The search string (with potentially added site: parameters).
  language: The language to use for the query ('hl' parameter).
  location: The location to use for the query ('gl' parameter).
@@ -93,20 +138,35 @@ class SerpApi(AsyncClient):
  q: The search string (with potentially added site: parameters).
  google_domain: The Google domain to use for the search (e.g. google.[com]).
  location_[requested|used]: The location to use for the search.
- tbs: The time-based search parameters (e.g. 'ctr:CH&cr:countryCH').
+ tbs: The to-be-searched parameters (e.g. 'ctr:CH').
+ cr: The country code to limit the search to (e.g. 'countryCH').
  gl: The country code to use for the search.
  hl: The language code to use for the search.
  num: The number of results to return.
  api_key: The API key to use for the search.
  """
+ if engine not in self._engine_marketplace_names:
+ raise ValueError(
+ f"Invalid SerpAPI search engine: {engine}. "
+ f"Supported engines are: {list(self._engine_marketplace_names.keys())}."
+ )
+ logger.debug(
+ f'Performing SerpAPI search with engine="{engine}", '
+ f'q="{search_string}", '
+ f'location="{location.name}", '
+ f'language="{language.code}", '
+ f"num_results={num_results}."
+ )
+
  # Setup the parameters
  params = {
- "engine": self._engine,
+ "engine": engine,
  "q": search_string,
  "google_domain": f"google.{location.code}",
  "location_requested": location.name,
  "location_used": location.name,
- "tbs": f"ctr:{location.code.upper()}&cr:country{location.code.upper()}",
+ "tbs": f"ctr:{location.code.upper()}",
+ "cr": f"country{location.code.upper()}",
  "gl": location.code,
  "hl": language.code,
  "num": num_results,
@@ -132,18 +192,11 @@ class SerpApi(AsyncClient):
  if err is not None:
  raise err

- # Get the organic_results
- results = response.get("organic_results")
- if results is None:
- logger.warning(
- f'No organic_results key in SerpAPI results for search_string="{search_string}".'
- )
- return []
+ # Extract the URLs from the response
+ urls = self._extract_search_results(response=response, engine=engine)

- # Extract urls
- urls = [res.get("link") for res in results]
  logger.debug(
- f'Found {len(urls)} URLs from SerpApi search for q="{search_string}".'
+ f'Found total of {len(urls)} URLs from SerpApi search for q="{search_string}" and engine="{engine}".'
  )
  return urls

@@ -234,6 +287,7 @@ class SerpApi(AsyncClient):

  def _create_serp_result(
  self,
+ engine: str,
  url: str,
  location: Location,
  marketplaces: List[Host] | None = None,
@@ -244,13 +298,18 @@ class SerpApi(AsyncClient):
  If marketplaces is None or the domain can not be extracted, the default marketplace name is used.

  Args:
+ engine: The search engine used.
  url: The URL to be processed.
  location: The location to use for the query.
  marketplaces: The list of marketplaces to compare the URL against.
+ excluded_urls: The list of excluded URLs.
  """
  # Get marketplace name
  domain = self._get_domain(url=url)
- marketplace_name = self._default_marketplace_name
+
+ # Select marketplace name based on engine
+ marketplace_name = self._engine_marketplace_names[engine]
+
  if marketplaces:
  try:
  marketplace_name = next(
@@ -277,9 +336,109 @@ class SerpApi(AsyncClient):
  )
  return result

+ async def _search_google(
+ self,
+ search_string: str,
+ language: Language,
+ location: Location,
+ num_results: int,
+ marketplaces: List[Host] | None = None,
+ excluded_urls: List[Host] | None = None,
+ ) -> List[SerpResult]:
+ """Performs a google search using SerpApi and returns SerpResults.
+
+ Args:
+ search_string: The search string (with potentially added site: parameters).
+ language: The language to use for the query ('hl' parameter).
+ location: The location to use for the query ('gl' parameter).
+ num_results: Max number of results to return.
+ marketplaces: The marketplaces to include in the search.
+ excluded_urls: The URLs to exclude from the search.
+ """
+ engine = SearchEngine.GOOGLE.value
+
+ # Perform the search
+ urls = await self._search(
+ engine=engine,
+ search_string=search_string,
+ language=language,
+ location=location,
+ num_results=num_results,
+ )
+
+ # Create SerpResult objects from the URLs
+ results = [
+ self._create_serp_result(
+ url=url,
+ location=location,
+ marketplaces=marketplaces,
+ excluded_urls=excluded_urls,
+ engine=engine,
+ )
+ for url in urls
+ ]
+
+ logger.debug(
+ f'Produced {len(results)} results from google search with q="{search_string}".'
+ )
+ return results
+
+ async def _search_google_shopping(
+ self,
+ search_string: str,
+ language: Language,
+ location: Location,
+ num_results: int,
+ marketplaces: List[Host] | None = None,
+ excluded_urls: List[Host] | None = None,
+ ) -> List[SerpResult]:
+ """Performs a google search using SerpApi and returns SerpResults.
+
+ Args:
+ search_string: The search string (with potentially added site: parameters).
+ language: The language to use for the query ('hl' parameter).
+ location: The location to use for the query ('gl' parameter).
+ num_results: Max number of results to return.
+ marketplaces: The marketplaces to include in the search.
+ excluded_urls: The URLs to exclude from the search.
+ """
+ engine = SearchEngine.GOOGLE_SHOPPING.value
+
+ # Perform the search
+ urls = await self._search(
+ engine=engine,
+ search_string=search_string,
+ language=language,
+ location=location,
+ num_results=num_results,
+ )
+
+ # !!! NOTE !!!: Google Shopping results do not properly support the 'num' parameter,
+ # so we might get more results than requested. This is a known issue with SerpAPI
+ # and Google Shopping searches (see https://github.com/serpapi/public-roadmap/issues/1858)
+ urls = urls[:num_results]
+
+ # Create SerpResult objects from the URLs
+ results = [
+ self._create_serp_result(
+ url=url,
+ location=location,
+ marketplaces=marketplaces,
+ excluded_urls=excluded_urls,
+ engine=engine,
+ )
+ for url in urls
+ ]
+
+ logger.debug(
+ f'Produced {len(results)} results from google shopping search with q="{search_string}".'
+ )
+ return results
+
  async def apply(
  self,
  search_term: str,
+ search_engines: List[SearchEngine],
  language: Language,
  location: Location,
  num_results: int,
@@ -305,27 +464,35 @@ class SerpApi(AsyncClient):
  sites = [dom for host in marketplaces for dom in host.domains]
  search_string += " site:" + " OR site:".join(s for s in sites)

- # Perform the search
- urls = await self._search(
- search_string=search_string,
- language=language,
- location=location,
- num_results=num_results,
- )
+ # Initialize the results list
+ results: List[SerpResult] = []

- # Form the SerpResult objects
- results = [
- self._create_serp_result(
- url=url,
+ # Perform the google search
+ if SearchEngine.GOOGLE in search_engines:
+ ggl_res = await self._search_google(
+ search_string=search_string,
+ language=language,
  location=location,
+ num_results=num_results,
  marketplaces=marketplaces,
  excluded_urls=excluded_urls,
  )
- for url in urls
- ]
+ results.extend(ggl_res)
+
+ # Perform the google shopping search
+ if SearchEngine.GOOGLE_SHOPPING in search_engines:
+ shp_res = await self._search_google_shopping(
+ search_string=search_string,
+ language=language,
+ location=location,
+ num_results=num_results,
+ marketplaces=marketplaces,
+ excluded_urls=excluded_urls,
+ )
+ results.extend(shp_res)

  num_non_filtered = len([res for res in results if not res.filtered])
  logger.info(
- f'Produced {num_non_filtered} results from SerpApi search with q="{search_string}".'
+ f'Produced a total of {num_non_filtered} results from SerpApi search with q="{search_string}".'
  )
  return results
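
For reference, the two engines are parsed from different parts of the SerpAPI payload: organic_results[].link for google and shopping_results[].product_link for google_shopping. A small sketch against mocked payloads, reaching into the private static helper purely for illustration (real responses carry many more keys):

from fraudcrawler import SerpApi, SearchEngine

google_response = {"organic_results": [{"link": "https://shop.example.ch/fridge-xy-200"}]}
shopping_response = {"shopping_results": [{"product_link": "https://example.com/shopping/product/123"}]}

print(SerpApi._extract_search_results(response=google_response, engine=SearchEngine.GOOGLE.value))
# ['https://shop.example.ch/fridge-xy-200']
print(SerpApi._extract_search_results(response=shopping_response, engine=SearchEngine.GOOGLE_SHOPPING.value))
# ['https://example.com/shopping/product/123']
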
fraudcrawler/scraping/zyte.py CHANGED
@@ -1,6 +1,7 @@
  import asyncio
  import logging
  from typing import List
+ from base64 import b64decode

  import aiohttp

@@ -68,7 +69,8 @@ class ZyteApi(AsyncClient):
  "metadata": {
  "probability": float,
  },
- }
+ },
+ "httpResponseBody": base64
  }
  """
  logger.info(f"Fetching product details by Zyte for URL {url}.")
@@ -192,3 +194,24 @@ class ZyteApi(AsyncClient):
  }
  """
  return float(details.get("product", {}).get("metadata", {}).get("probability"))
+
+ @staticmethod
+ def extract_html(details: dict) -> str | None:
+ """Extracts the HTML from the Zyte API response.
+
+ The input argument is a dictionary of the following structure:
+ {
+ "httpResponseBody": base64
+ }
+ """
+
+ # Get the Base64-encoded content
+ encoded = details.get("httpResponseBody")
+
+ # Decode it into bytes
+ if isinstance(encoded, str):
+ decoded_bytes = b64decode(encoded)
+
+ # Convert bytes to string (assuming UTF-8 encoding)
+ decoded_string = decoded_bytes.decode("utf-8")
+ return decoded_string
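
extract_html simply base64-decodes httpResponseBody; the orchestrator then strips the markup with BeautifulSoup to populate html_clean (see the orchestrator hunk above). A hedged end-to-end sketch with a fabricated payload:

from base64 import b64encode
from bs4 import BeautifulSoup
from fraudcrawler import ZyteApi

# Fabricated Zyte-style response: httpResponseBody holds base64-encoded HTML.
payload = {
    "httpResponseBody": b64encode(
        b"<html><body><h1>Fridge XY-200</h1><p>In stock</p></body></html>"
    ).decode()
}

html = ZyteApi.extract_html(details=payload)
if html:
    html_clean = BeautifulSoup(html, "html.parser").get_text(separator=" ", strip=True)
    print(html_clean)  # Fridge XY-200 In stock
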
fraudcrawler/settings.py CHANGED
@@ -1,4 +1,5 @@
  from pathlib import Path
+ from typing import List

  # Generic settings
  MAX_RETRIES = 3
@@ -8,8 +9,8 @@ ROOT_DIR = Path(__file__).parents[1]
  # Serp settings
  GOOGLE_LOCATIONS_FILENAME = ROOT_DIR / "fraudcrawler" / "base" / "google-locations.json"
  GOOGLE_LANGUAGES_FILENAME = ROOT_DIR / "fraudcrawler" / "base" / "google-languages.json"
- SERP_DEFAULT_COUNTRY_CODES = [
- ".com",
+ SERP_DEFAULT_COUNTRY_CODES: List[str] = [
+ # ".com",
  ]

  # Enrichment settings
@@ -21,9 +22,8 @@ ZYTE_DEFALUT_PROBABILITY_THRESHOLD = 0.1
  # Processor settings
  PROCESSOR_DEFAULT_MODEL = "gpt-4o"
  PROCESSOR_DEFAULT_IF_MISSING = -1
- PROCESSOR_USER_PROMPT_TEMPLATE = (
- "Context: {context}\n\nProduct Details: {name}\n{description}\\n\nRelevance:"
- )
+ PROCESSOR_USER_PROMPT_TEMPLATE = "Product Details:\n{product_details}\n\nRelevance:"
+ PROCESSOR_PRODUCT_DETAILS_TEMPLATE = "{field_name}:\n{field_value}"

  # Async settings
  DEFAULT_N_SERP_WKRS = 10
@@ -1,6 +1,6 @@
  Metadata-Version: 2.3
  Name: fraudcrawler
- Version: 0.3.10
+ Version: 0.4.2
  Summary: Intelligent Market Monitoring
  Home-page: https://github.com/open-veanu/fraudcrawler
  License: MIT
@@ -13,6 +13,7 @@ Classifier: Programming Language :: Python :: 3.11
  Classifier: Programming Language :: Python :: 3.12
  Classifier: Programming Language :: Python :: 3.13
  Requires-Dist: aiohttp (>=3.11.14,<4.0.0)
+ Requires-Dist: beautifulsoup4 (>=4.13.4,<5.0.0)
  Requires-Dist: openai (>=1.68.2,<2.0.0)
  Requires-Dist: pandas (>=2.2.3,<3.0.0)
  Requires-Dist: pydantic-settings (>=2.8.1,<3.0.0)
@@ -80,7 +81,6 @@ deepness = Deepness(num_results=50)
  prompts = [
  Prompt(
  name="relevance",
- context="This organization is interested in medical products and drugs.",
  system_prompt=(
  "You are a helpful and intelligent assistant. Your task is to classify any given product "
  "as either relevant (1) or not relevant (0), strictly based on the context and product details provided by the user. "
@@ -0,0 +1,20 @@
+ fraudcrawler/__init__.py,sha256=yXFdQzlSLUZV4Oh0wkzghvPlICQO5TnpEtIHZaTay_c,717
+ fraudcrawler/base/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ fraudcrawler/base/base.py,sha256=JWjZ3mpX4caQAsWKYqtHrUqHfHr6GXlAaEjxxHV9ODQ,6020
+ fraudcrawler/base/client.py,sha256=FibiYycjUys-c4sv66Y2JqJu5y15be2MYd2_9yB3wG8,4936
+ fraudcrawler/base/google-languages.json,sha256=z0VtjMCsCcZq11OkCIb9jMDD1p9Ty4lhV7bq4ddYvec,10748
+ fraudcrawler/base/google-locations.json,sha256=UtNu0iSStllvFRTQXMobWKmZR1hKmtgtHftLNgaJTT0,9204345
+ fraudcrawler/base/orchestrator.py,sha256=p1gRtj3jVaFmtwPSKruiOixu3QDuSiHjPKFi0KKsgPk,24591
+ fraudcrawler/launch_demo_pipeline.py,sha256=zQxKAekJ56iKQ5-NeM0UMS-1Wd3ui0bpeqkH1nM9A4A,4628
+ fraudcrawler/processing/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ fraudcrawler/processing/processor.py,sha256=An2orst0YRIav7bFuoDMgjwWz2Z9dyjVUbkNAMXNTTo,3748
+ fraudcrawler/scraping/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ fraudcrawler/scraping/enrich.py,sha256=X1BBZshdZqPmbduzhGwH0ULSzq03L_7bf7_UL8yOQ9E,10608
+ fraudcrawler/scraping/serp.py,sha256=ix2kCs9Xo694K8vjDL104MDb2Cun1AXfStxMaR-2u7U,17865
+ fraudcrawler/scraping/zyte.py,sha256=DUF5pIwpZyQw30qURnFxtp8KYpUgBkrXjM7RaVGH92Q,7005
+ fraudcrawler/settings.py,sha256=z63Lc8LnmfG7u0F7CVlGOXMMpr7LtJC0BzXDoA8rN7Q,839
+ fraudcrawler-0.4.2.dist-info/LICENSE,sha256=B-3FuHfe3S0fWAlKlceskPcRhzXq81g-rJ-ddUYb4O8,1062
+ fraudcrawler-0.4.2.dist-info/METADATA,sha256=M1xMdweLHpSbfEceT_5GpcDiLdDHpOHpzQ5w-ZNF4gQ,5931
+ fraudcrawler-0.4.2.dist-info/WHEEL,sha256=RaoafKOydTQ7I_I3JTrPCg6kUmTgtm4BornzOqyEfJ8,88
+ fraudcrawler-0.4.2.dist-info/entry_points.txt,sha256=1Befm7cM6945y2AA1z9V4gZV63mtSWcAs7ypvgux_Xg,79
+ fraudcrawler-0.4.2.dist-info/RECORD,,
@@ -1,20 +0,0 @@
- fraudcrawler/__init__.py,sha256=2EgoTb2jNcQt1NxUV8za0154kb7ZnHZ_KeKgx21rdFs,679
- fraudcrawler/base/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- fraudcrawler/base/base.py,sha256=woesbPztEh7tbD0ty9S37JbFrbEC-01H9etmCT2ffnc,4771
- fraudcrawler/base/client.py,sha256=GcTUMqLfvweLFdHy6CP9tgxsFQiPkc6KyiLcwLnDiw8,4412
- fraudcrawler/base/google-languages.json,sha256=z0VtjMCsCcZq11OkCIb9jMDD1p9Ty4lhV7bq4ddYvec,10748
- fraudcrawler/base/google-locations.json,sha256=UtNu0iSStllvFRTQXMobWKmZR1hKmtgtHftLNgaJTT0,9204345
- fraudcrawler/base/orchestrator.py,sha256=Gmryv8l8nB1QUwwjLoZGop2mwKqWYQQORT_96_w5ptA,23981
- fraudcrawler/launch_demo_pipeline.py,sha256=RIZTtdtZeJPhvSLp1IUjT_nhme_2q6mAGWKoL838E4E,4320
- fraudcrawler/processing/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- fraudcrawler/processing/processor.py,sha256=IFVKIiNi0QoCAgPFkFtNDgxfhh01iDNUyIBZWACplR8,3993
- fraudcrawler/scraping/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- fraudcrawler/scraping/enrich.py,sha256=X1BBZshdZqPmbduzhGwH0ULSzq03L_7bf7_UL8yOQ9E,10608
- fraudcrawler/scraping/serp.py,sha256=xBXqBcgO25xtiV3CnTLRuUeKIpnWGaAyDqF7KeGAcks,11750
- fraudcrawler/scraping/zyte.py,sha256=ggI4iYG-E_UyiKgUpEFekeUd1giifEfJ_uyFUSJGSLY,6296
- fraudcrawler/settings.py,sha256=1SVxjwMLuZd_rr3KkwYoRozTBw2VQU-OJQkgA33k95Q,768
- fraudcrawler-0.3.10.dist-info/LICENSE,sha256=B-3FuHfe3S0fWAlKlceskPcRhzXq81g-rJ-ddUYb4O8,1062
- fraudcrawler-0.3.10.dist-info/METADATA,sha256=Nkr3t_4q_pejrdBFyzbOq9ePlauQwy-ZM_Njr1n6OSk,5966
- fraudcrawler-0.3.10.dist-info/WHEEL,sha256=RaoafKOydTQ7I_I3JTrPCg6kUmTgtm4BornzOqyEfJ8,88
- fraudcrawler-0.3.10.dist-info/entry_points.txt,sha256=1Befm7cM6945y2AA1z9V4gZV63mtSWcAs7ypvgux_Xg,79
- fraudcrawler-0.3.10.dist-info/RECORD,,