fraudcrawler 0.4.0__tar.gz → 0.4.3__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

This version of fraudcrawler has been flagged as potentially problematic.

Files changed (21)
  1. {fraudcrawler-0.4.0 → fraudcrawler-0.4.3}/PKG-INFO +2 -2
  2. {fraudcrawler-0.4.0 → fraudcrawler-0.4.3}/README.md +0 -1
  3. {fraudcrawler-0.4.0 → fraudcrawler-0.4.3}/fraudcrawler/__init__.py +4 -1
  4. {fraudcrawler-0.4.0 → fraudcrawler-0.4.3}/fraudcrawler/base/base.py +41 -2
  5. {fraudcrawler-0.4.0 → fraudcrawler-0.4.3}/fraudcrawler/base/client.py +10 -2
  6. {fraudcrawler-0.4.0 → fraudcrawler-0.4.3}/fraudcrawler/base/orchestrator.py +45 -47
  7. fraudcrawler-0.4.3/fraudcrawler/launch_demo_pipeline.py +101 -0
  8. {fraudcrawler-0.4.0 → fraudcrawler-0.4.3}/fraudcrawler/processing/processor.py +11 -19
  9. {fraudcrawler-0.4.0 → fraudcrawler-0.4.3}/fraudcrawler/scraping/serp.py +4 -2
  10. fraudcrawler-0.4.3/fraudcrawler/scraping/url.py +57 -0
  11. {fraudcrawler-0.4.0 → fraudcrawler-0.4.3}/fraudcrawler/scraping/zyte.py +24 -1
  12. {fraudcrawler-0.4.0 → fraudcrawler-0.4.3}/fraudcrawler/settings.py +14 -3
  13. {fraudcrawler-0.4.0 → fraudcrawler-0.4.3}/pyproject.toml +2 -1
  14. fraudcrawler-0.4.0/fraudcrawler/launch_demo_pipeline.py +0 -100
  15. {fraudcrawler-0.4.0 → fraudcrawler-0.4.3}/LICENSE +0 -0
  16. {fraudcrawler-0.4.0 → fraudcrawler-0.4.3}/fraudcrawler/base/__init__.py +0 -0
  17. {fraudcrawler-0.4.0 → fraudcrawler-0.4.3}/fraudcrawler/base/google-languages.json +0 -0
  18. {fraudcrawler-0.4.0 → fraudcrawler-0.4.3}/fraudcrawler/base/google-locations.json +0 -0
  19. {fraudcrawler-0.4.0 → fraudcrawler-0.4.3}/fraudcrawler/processing/__init__.py +0 -0
  20. {fraudcrawler-0.4.0 → fraudcrawler-0.4.3}/fraudcrawler/scraping/__init__.py +0 -0
  21. {fraudcrawler-0.4.0 → fraudcrawler-0.4.3}/fraudcrawler/scraping/enrich.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: fraudcrawler
3
- Version: 0.4.0
3
+ Version: 0.4.3
4
4
  Summary: Intelligent Market Monitoring
5
5
  Home-page: https://github.com/open-veanu/fraudcrawler
6
6
  License: MIT
@@ -13,6 +13,7 @@ Classifier: Programming Language :: Python :: 3.11
13
13
  Classifier: Programming Language :: Python :: 3.12
14
14
  Classifier: Programming Language :: Python :: 3.13
15
15
  Requires-Dist: aiohttp (>=3.11.14,<4.0.0)
16
+ Requires-Dist: beautifulsoup4 (>=4.13.4,<5.0.0)
16
17
  Requires-Dist: openai (>=1.68.2,<2.0.0)
17
18
  Requires-Dist: pandas (>=2.2.3,<3.0.0)
18
19
  Requires-Dist: pydantic-settings (>=2.8.1,<3.0.0)
@@ -80,7 +81,6 @@ deepness = Deepness(num_results=50)
80
81
  prompts = [
81
82
  Prompt(
82
83
  name="relevance",
83
- context="This organization is interested in medical products and drugs.",
84
84
  system_prompt=(
85
85
  "You are a helpful and intelligent assistant. Your task is to classify any given product "
86
86
  "as either relevant (1) or not relevant (0), strictly based on the context and product details provided by the user. "
@@ -58,7 +58,6 @@ deepness = Deepness(num_results=50)
58
58
  prompts = [
59
59
  Prompt(
60
60
  name="relevance",
61
- context="This organization is interested in medical products and drugs.",
62
61
  system_prompt=(
63
62
  "You are a helpful and intelligent assistant. Your task is to classify any given product "
64
63
  "as either relevant (1) or not relevant (0), strictly based on the context and product details provided by the user. "
@@ -1,8 +1,9 @@
1
1
  from fraudcrawler.scraping.serp import SerpApi, SearchEngine
2
2
  from fraudcrawler.scraping.enrich import Enricher
3
+ from fraudcrawler.scraping.url import URLCollector
3
4
  from fraudcrawler.scraping.zyte import ZyteApi
4
5
  from fraudcrawler.processing.processor import Processor
5
- from fraudcrawler.base.orchestrator import Orchestrator, ProductItem
6
+ from fraudcrawler.base.orchestrator import Orchestrator
6
7
  from fraudcrawler.base.client import FraudCrawlerClient
7
8
  from fraudcrawler.base.base import (
8
9
  Deepness,
@@ -11,12 +12,14 @@ from fraudcrawler.base.base import (
11
12
  Language,
12
13
  Location,
13
14
  Prompt,
15
+ ProductItem,
14
16
  )
15
17
 
16
18
  __all__ = [
17
19
  "SerpApi",
18
20
  "SearchEngine",
19
21
  "Enricher",
22
+ "URLCollector",
20
23
  "ZyteApi",
21
24
  "Processor",
22
25
  "Orchestrator",
@@ -2,12 +2,13 @@ import json
2
2
  import logging
3
3
  from pydantic import (
4
4
  BaseModel,
5
+ Field,
5
6
  field_validator,
6
7
  model_validator,
7
8
  )
8
9
  from pydantic_settings import BaseSettings
9
10
  import re
10
- from typing import List
11
+ from typing import List, Dict
11
12
 
12
13
  import aiohttp
13
14
 
@@ -114,12 +115,39 @@ class Deepness(BaseModel):
114
115
  enrichment: Enrichment | None = None
115
116
 
116
117
 
118
+ class ProductItem(BaseModel):
119
+ """Model representing a product item."""
120
+
121
+ # Serp/Enrich parameters
122
+ search_term: str
123
+ search_term_type: str
124
+ url: str
125
+ marketplace_name: str
126
+ domain: str
127
+
128
+ # Zyte parameters
129
+ product_name: str | None = None
130
+ product_price: str | None = None
131
+ product_description: str | None = None
132
+ product_images: List[str] | None = None
133
+ probability: float | None = None
134
+ html: str | None = None
135
+ html_clean: str | None = None
136
+
137
+ # Processor parameters are set dynamic so we must allow extra fields
138
+ classifications: Dict[str, int] = Field(default_factory=dict)
139
+
140
+ # Filtering parameters
141
+ filtered: bool = False
142
+ filtered_at_stage: str | None = None
143
+
144
+
117
145
  class Prompt(BaseModel):
118
146
  """Model for prompts."""
119
147
 
120
148
  name: str
121
- context: str
122
149
  system_prompt: str
150
+ product_item_fields: List[str]
123
151
  allowed_classes: List[int]
124
152
 
125
153
  @field_validator("allowed_classes", mode="before")
@@ -129,6 +157,17 @@ class Prompt(BaseModel):
129
157
  raise ValueError("all values in allowed_classes must be positive integers.")
130
158
  return val
131
159
 
160
+ @field_validator("product_item_fields", mode="before")
161
+ def validate_product_item_fields(cls, val):
162
+ """Ensure all product_item_fields are valid ProductItem attributes."""
163
+ valid_fields = set(ProductItem.model_fields.keys())
164
+ for field in val:
165
+ if field not in valid_fields:
166
+ raise ValueError(
167
+ f"Invalid product_item_field: '{field}'. Must be one of: {sorted(valid_fields)}"
168
+ )
169
+ return val
170
+
132
171
 
133
172
  class AsyncClient:
134
173
  """Base class for sub-classes using async HTTP requests."""
@@ -9,8 +9,16 @@ from typing import List
9
9
  import pandas as pd
10
10
 
11
11
  from fraudcrawler.settings import ROOT_DIR
12
- from fraudcrawler.base.base import Setup, Language, Location, Deepness, Host, Prompt
13
- from fraudcrawler.base.orchestrator import Orchestrator, ProductItem
12
+ from fraudcrawler.base.base import (
13
+ Setup,
14
+ Language,
15
+ Location,
16
+ Deepness,
17
+ Host,
18
+ Prompt,
19
+ ProductItem,
20
+ )
21
+ from fraudcrawler.base.orchestrator import Orchestrator
14
22
  from fraudcrawler.scraping.serp import SearchEngine
15
23
 
16
24
  logger = logging.getLogger(__name__)
@@ -1,12 +1,14 @@
1
1
  from abc import ABC, abstractmethod
2
2
  import asyncio
3
3
  import logging
4
- from pydantic import BaseModel, Field
5
- from typing import Dict, List, Set, cast
4
+ from typing import Dict, List, cast
5
+
6
+ from bs4 import BeautifulSoup
6
7
 
7
8
  from fraudcrawler.settings import (
8
9
  PROCESSOR_DEFAULT_MODEL,
9
10
  PROCESSOR_DEFAULT_IF_MISSING,
11
+ PROCESSOR_PRODUCT_DETAILS_TEMPLATE,
10
12
  MAX_RETRIES,
11
13
  RETRY_DELAY,
12
14
  )
@@ -15,37 +17,26 @@ from fraudcrawler.settings import (
15
17
  DEFAULT_N_ZYTE_WKRS,
16
18
  DEFAULT_N_PROC_WKRS,
17
19
  )
18
- from fraudcrawler.base.base import Deepness, Host, Language, Location, Prompt
19
- from fraudcrawler import SerpApi, SearchEngine, Enricher, ZyteApi, Processor
20
+ from fraudcrawler.base.base import (
21
+ Deepness,
22
+ Host,
23
+ Language,
24
+ Location,
25
+ Prompt,
26
+ ProductItem,
27
+ )
28
+ from fraudcrawler import (
29
+ SerpApi,
30
+ SearchEngine,
31
+ Enricher,
32
+ URLCollector,
33
+ ZyteApi,
34
+ Processor,
35
+ )
20
36
 
21
37
  logger = logging.getLogger(__name__)
22
38
 
23
39
 
24
- class ProductItem(BaseModel):
25
- """Model representing a product item."""
26
-
27
- # Serp/Enrich parameters
28
- search_term: str
29
- search_term_type: str
30
- url: str
31
- marketplace_name: str
32
- domain: str
33
-
34
- # Zyte parameters
35
- product_name: str | None = None
36
- product_price: str | None = None
37
- product_description: str | None = None
38
- product_images: List[str] | None = None
39
- probability: float | None = None
40
-
41
- # Processor parameters are set dynamic so we must allow extra fields
42
- classifications: Dict[str, int] = Field(default_factory=dict)
43
-
44
- # Filtering parameters
45
- filtered: bool = False
46
- filtered_at_stage: str | None = None
47
-
48
-
49
40
  class Orchestrator(ABC):
50
41
  """Abstract base class for orchestrating the different actors (crawling, processing).
51
42
 
@@ -92,15 +83,12 @@ class Orchestrator(ABC):
92
83
  n_zyte_wkrs: Number of async workers for zyte (optional).
93
84
  n_proc_wkrs: Number of async workers for the processor (optional).
94
85
  """
95
- # Setup the variables
96
- self._collected_urls_current_run: Set[str] = set()
97
- self._collected_urls_previous_runs: Set[str] = set()
98
-
99
86
  # Setup the clients
100
87
  self._serpapi = SerpApi(
101
88
  api_key=serpapi_key, max_retries=max_retries, retry_delay=retry_delay
102
89
  )
103
90
  self._enricher = Enricher(user=dataforseo_user, pwd=dataforseo_pwd)
91
+ self._url_collector = URLCollector()
104
92
  self._zyteapi = ZyteApi(
105
93
  api_key=zyteapi_key, max_retries=max_retries, retry_delay=retry_delay
106
94
  )
@@ -173,16 +161,18 @@ class Orchestrator(ABC):
173
161
  break
174
162
 
175
163
  if not product.filtered:
176
- url = product.url
164
+ # Clean the URL by removing tracking parameters
165
+ url = self._url_collector.remove_tracking_parameters(product.url)
166
+ product.url = url
177
167
 
178
- if url in self._collected_urls_current_run:
168
+ if url in self._url_collector.collected_currently:
179
169
  # deduplicate on current run
180
170
  product.filtered = True
181
171
  product.filtered_at_stage = (
182
172
  "URL collection (current run deduplication)"
183
173
  )
184
174
  logger.debug(f"URL {url} already collected in current run")
185
- elif url in self._collected_urls_previous_runs:
175
+ elif url in self._url_collector.collected_previously:
186
176
  # deduplicate on previous runs coming from a db
187
177
  product.filtered = True
188
178
  product.filtered_at_stage = (
@@ -190,7 +180,7 @@ class Orchestrator(ABC):
190
180
  )
191
181
  logger.debug(f"URL {url} as already collected in previous run")
192
182
  else:
193
- self._collected_urls_current_run.add(url)
183
+ self._url_collector.collected_currently.add(url)
194
184
 
195
185
  await queue_out.put(product)
196
186
  queue_in.task_done()
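
The deduplication sets that previously lived on the orchestrator are now owned by `URLCollector` (added in `fraudcrawler/scraping/url.py`, shown further down). A rough standalone sketch of the per-product decision this worker makes, using invented URLs:

    from fraudcrawler import URLCollector

    collector = URLCollector()
    collector.collected_previously = {"https://shop.example.ch/item/42"}  # e.g. seeded from previously_collected_urls

    def keep(raw_url: str) -> bool:
        """Mirrors the worker's decision: True if the product passes URL collection."""
        url = collector.remove_tracking_parameters(raw_url)
        if url in collector.collected_currently:
            return False  # filtered_at_stage = "URL collection (current run deduplication)"
        if url in collector.collected_previously:
            return False  # filtered at the previous-run deduplication stage
        collector.collected_currently.add(url)
        return True

    print(keep("https://shop.example.ch/item/7?utm_source=serp"))   # True, tracking parameter stripped first
    print(keep("https://shop.example.ch/item/7"))                    # False, already seen in this run
    print(keep("https://shop.example.ch/item/42?utm_medium=email"))  # False, known from a previous run
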
@@ -231,15 +221,16 @@ class Orchestrator(ABC):
231
221
  product.probability = self._zyteapi.extract_probability(
232
222
  details=details
233
223
  )
234
-
224
+ product.html = self._zyteapi.extract_html(details=details)
225
+ if product.html:
226
+ soup = BeautifulSoup(product.html, "html.parser")
227
+ product.html_clean = soup.get_text(separator=" ", strip=True)
235
228
  # Filter the product based on the probability threshold
236
229
  if not self._zyteapi.keep_product(details=details):
237
230
  product.filtered = True
238
231
  product.filtered_at_stage = "Zyte probability threshold"
239
-
240
232
  except Exception as e:
241
233
  logger.warning(f"Error executing Zyte API search: {e}.")
242
-
243
234
  await queue_out.put(product)
244
235
  queue_in.task_done()
245
236
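
The new `html_clean` field is the raw Zyte HTML flattened to plain text with BeautifulSoup's `get_text`. A small self-contained sketch of that step (the HTML snippet is made up):

    from bs4 import BeautifulSoup

    html = "<html><body><h1>Medion Kühlbox MD 37454</h1><p>In stock, ships within 2 days.</p></body></html>"
    soup = BeautifulSoup(html, "html.parser")
    html_clean = soup.get_text(separator=" ", strip=True)
    print(html_clean)  # -> "Medion Kühlbox MD 37454 In stock, ships within 2 days."
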
 
@@ -269,19 +260,26 @@ class Orchestrator(ABC):
269
260
  if not product.filtered:
270
261
  try:
271
262
  url = product.url
272
- name = product.product_name
273
- description = product.product_description
274
-
275
263
  # Run all the configured prompts
276
264
  for prompt in prompts:
265
+ # Dynamically build product_details string
266
+ details = []
267
+ for field in prompt.product_item_fields:
268
+ value = getattr(product, field, None)
269
+ if value is not None:
270
+ details.append(
271
+ PROCESSOR_PRODUCT_DETAILS_TEMPLATE.format(
272
+ field_name=field, field_value=value
273
+ )
274
+ )
275
+ product_details = "\n\n".join(details)
277
276
  logger.debug(
278
- f"Classify product {name} with prompt {prompt.name}"
277
+ f"Classify product at {url} with prompt {prompt.name} and details: {product_details}"
279
278
  )
280
279
  classification = await self._processor.classify(
281
280
  prompt=prompt,
282
281
  url=url,
283
- name=name,
284
- description=description,
282
+ product_details=product_details,
285
283
  )
286
284
  product.classifications[prompt.name] = classification
287
285
  except Exception as e:
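
With `Prompt.context` removed, the text sent to the classifier is now assembled per prompt from whichever `ProductItem` fields the prompt lists, using `PROCESSOR_PRODUCT_DETAILS_TEMPLATE` from `settings.py`. A minimal reconstruction of that assembly outside the worker; all field values below are invented:

    from fraudcrawler import Prompt, ProductItem
    from fraudcrawler.settings import PROCESSOR_PRODUCT_DETAILS_TEMPLATE

    prompt = Prompt(
        name="availability",
        system_prompt="Respond only with 1 (available) or 0 (not available).",
        product_item_fields=["product_name", "product_price"],
        allowed_classes=[0, 1],
    )
    product = ProductItem(
        search_term="Medion Kühlbox MD 37454",
        search_term_type="initial",
        url="https://shop.example.ch/item/42",
        marketplace_name="Example Shop",
        domain="shop.example.ch",
        product_name="Medion Kühlbox MD 37454",
        product_price="99.95 CHF",
    )

    details = []
    for field in prompt.product_item_fields:
        value = getattr(product, field, None)
        if value is not None:
            details.append(
                PROCESSOR_PRODUCT_DETAILS_TEMPLATE.format(field_name=field, field_value=value)
            )
    product_details = "\n\n".join(details)
    print(product_details)
    # product_name:
    # Medion Kühlbox MD 37454
    #
    # product_price:
    # 99.95 CHF
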
@@ -489,7 +487,7 @@ class Orchestrator(ABC):
489
487
  # INITIAL SETUP
490
488
  # ---------------------------
491
489
  if previously_collected_urls:
492
- self._collected_urls_previous_runs = set(self._collected_urls_current_run)
490
+ self._url_collector.collected_previously = set(previously_collected_urls)
493
491
 
494
492
  # Setup the async framework
495
493
  n_terms_max = 1 + (
@@ -0,0 +1,101 @@
1
+ import logging
2
+
3
+ from fraudcrawler import FraudCrawlerClient, Language, Location, Deepness, Prompt
4
+
5
+ LOG_FMT = "%(asctime)s | %(name)s | %(funcName)s | %(levelname)s | %(message)s"
6
+ LOG_LVL = "INFO"
7
+ DATE_FMT = "%Y-%m-%d %H:%M:%S"
8
+ logging.basicConfig(format=LOG_FMT, level=LOG_LVL, datefmt=DATE_FMT)
9
+
10
+
11
+ def main():
12
+ # Setup the client
13
+ client = FraudCrawlerClient()
14
+
15
+ # Setup the search
16
+ search_term = "Medion Kühlbox MD 37454"
17
+ language = Language(name="German")
18
+ location = Location(name="Switzerland")
19
+ deepness = Deepness(num_results=10)
20
+ prompts = [
21
+ Prompt(
22
+ name="availability",
23
+ system_prompt=(
24
+ "You are a helpful and intelligent assistant helping an organization that is interested in checking the availability of certain products."
25
+ "Your task is to classify any given product as either available (1) or not available (0), strictly based on the context and product details provided by the user. "
26
+ "You must consider all aspects of the given context and make a binary decision accordingly. "
27
+ "If the product can be purchased, added to a shopping basket, delivered, or is listed as available in any form, classify it as 1 (available); "
28
+ "if there is any mention of out of stock, not available, no longer shippable, or similar, classify it as 0 (not available). "
29
+ "Respond only with the number 1 or 0."
30
+ ),
31
+ product_item_fields=["product_name", "html_clean"],
32
+ allowed_classes=[0, 1],
33
+ ),
34
+ # Prompt(
35
+ # name="seriousness",
36
+ # system_prompt=(
37
+ # "You are a helpful and intelligent assistant helping an organization that is interested in checking the energy efficiency of certain devices. "
38
+ # "Your task is to classify each item as either a product for sale (1) or not a product for sale (0). To make this distinction, consider the following criteria: \n"
39
+ # " 1 Product for Sale (1): Classify as 1 if the result clearly indicates an item available for purchase, typically found "
40
+ # "within an online shop or marketplace.\n"
41
+ # " 2 Not a Product for Sale (0): Classify as 0 if the result is unrelated to a direct purchase of a product. This includes items such as: \n"
42
+ # " - Books and Videos: These may be available for sale, but if they are about or related to the searched product rather than being the "
43
+ # "exact product itself, classify as 0.\n"
44
+ # " - Advertisements: Promotional content that doesn't directly sell a product.\n"
45
+ # " - Companies and Services: Names and descriptions of companies or services related to the product but not the product itself.\n"
46
+ # " - Related Topics/Content: Any text or media that discusses or elaborates on the topic without offering a tangible product for sale.\n"
47
+ # "Make your decision based solely on the context and details provided in the search result. Respond only with the number 1 or 0."
48
+ # ),
49
+ # product_item_fields=["product_name", "product_description"],
50
+ # allowed_classes=[0, 1],
51
+ # ),
52
+ ]
53
+ # # Optional: Add tern ENRICHEMENT
54
+ # from fraudcrawler import Enrichment
55
+
56
+ # deepness.enrichment = Enrichment(additional_terms=10, additional_urls_per_term=20)
57
+
58
+ # # Optional: Add MARKETPLACES and EXCLUDED_URLS
59
+ # from fraudcrawler import Host
60
+
61
+ # marketplaces = [
62
+ # Host(name="International", domains="zavamed.com,apomeds.com"),
63
+ # Host(name="National", domains="netdoktor.ch, nobelpharma.ch")
64
+ # ]
65
+ # excluded_urls = [
66
+ # Host(name="Digitec", domains="digitec.ch"),
67
+ # Host(name="Brack", domains="brack.ch"),
68
+ # ]
69
+
70
+ # Execute the pipeline
71
+ client.execute(
72
+ search_term=search_term,
73
+ language=language,
74
+ location=location,
75
+ deepness=deepness,
76
+ prompts=prompts,
77
+ # marketplaces=marketplaces,
78
+ # excluded_urls=excluded_urls,
79
+ )
80
+
81
+ # Show results
82
+ print()
83
+ title = "Available results"
84
+ print(title)
85
+ print("=" * len(title))
86
+ client.print_available_results()
87
+ print()
88
+ title = f'Results for "{search_term.upper()}"'
89
+ print(title)
90
+ print("=" * len(title))
91
+ df = client.load_results()
92
+ print(f"Number of products found: {len(df)}")
93
+ print()
94
+ n_head = 10
95
+ print(f"First {n_head} products are:")
96
+ print(df.head(n=n_head))
97
+ print()
98
+
99
+
100
+ if __name__ == "__main__":
101
+ main()
@@ -52,42 +52,34 @@ class Processor:
52
52
  raise ValueError("Empty response from OpenAI API")
53
53
  return content
54
54
 
55
- async def classify(
56
- self, prompt: Prompt, url: str, name: str | None, description: str | None
57
- ) -> int:
58
- """A generic classification method that classified a product based on a prompt object.
55
+ async def classify(self, prompt: Prompt, url: str, product_details: str) -> int:
56
+ """A generic classification method that classifies a product based on a prompt object.
59
57
 
60
58
  Args:
61
- prompt: A dictionary with keys "system_prompt", "user_prompt", etc.
59
+ prompt: A dictionary with keys "system_prompt", etc.
62
60
  url: Product URL (often used in the user_prompt).
63
- name: Product name (often used in the user_prompt).
64
- description: Product description (often used in the user_prompt).
61
+ product_details: String with product details, formatted per prompt.product_item_fields.
65
62
 
66
63
  Note:
67
64
  This method returns `PROCESSOR_DEFAULT_IF_MISSING` if:
68
- - 'name' or 'description' is None
65
+ - product_details is empty
69
66
  - an error occurs during the API call
70
67
  - if the response isn't in allowed_classes.
71
68
  """
72
69
  # If required fields are missing, return the prompt's default fallback if provided.
73
- if name is None or description is None:
74
- logger.warning(
75
- f"Missing required fields for classification: name='{name}', description='{description}'"
76
- )
70
+ if not product_details:
71
+ logger.warning("Missing required product_details for classification.")
77
72
  return self._default_if_missing
78
73
 
79
74
  # Substitute placeholders in user_prompt with the relevant arguments
80
75
  user_prompt = PROCESSOR_USER_PROMPT_TEMPLATE.format(
81
- context=prompt.context,
82
- url=url,
83
- name=name,
84
- description=description,
76
+ product_details=product_details,
85
77
  )
86
78
 
87
79
  # Call the OpenAI API
88
80
  try:
89
81
  logger.debug(
90
- f'Calling OpenAI API for classification (name="{name}", prompt="{prompt.name}")'
82
+ f'Calling OpenAI API for classification (url="{url}", prompt="{prompt.name}")'
91
83
  )
92
84
  content = await self._call_openai_api(
93
85
  system_prompt=prompt.system_prompt,
@@ -104,12 +96,12 @@ class Processor:
104
96
  return self._default_if_missing
105
97
 
106
98
  logger.info(
107
- f'Classification for "{name}" (prompt={prompt.name}): {classification}'
99
+ f'Classification for url="{url}" (prompt={prompt.name}): {classification}'
108
100
  )
109
101
  return classification
110
102
 
111
103
  except Exception as e:
112
104
  logger.error(
113
- f'Error classifying product "{name}" with prompt "{prompt.name}": {e}'
105
+ f'Error classifying product at url="{url}" with prompt "{prompt.name}": {e}'
114
106
  )
115
107
  return self._default_if_missing
@@ -4,16 +4,17 @@ import logging
4
4
  from pydantic import BaseModel
5
5
  from typing import List
6
6
  from urllib.parse import urlparse
7
+ import re
7
8
 
8
9
  from fraudcrawler.settings import MAX_RETRIES, RETRY_DELAY, SERP_DEFAULT_COUNTRY_CODES
9
10
  from fraudcrawler.base.base import Host, Language, Location, AsyncClient
10
- import re
11
11
 
12
12
  logger = logging.getLogger(__name__)
13
13
 
14
14
 
15
15
  class SerpResult(BaseModel):
16
16
  """Model for a single search result from SerpApi."""
17
+
17
18
  url: str
18
19
  domain: str
19
20
  marketplace_name: str
@@ -23,6 +24,7 @@ class SerpResult(BaseModel):
23
24
 
24
25
  class SearchEngine(Enum):
25
26
  """Enum for the supported search engines."""
27
+
26
28
  GOOGLE = "google"
27
29
  GOOGLE_SHOPPING = "google_shopping"
28
30
 
@@ -33,7 +35,7 @@ class SerpApi(AsyncClient):
33
35
  _endpoint = "https://serpapi.com/search"
34
36
  _engine_marketplace_names = {
35
37
  SearchEngine.GOOGLE.value: "Google",
36
- SearchEngine.GOOGLE_SHOPPING.value: "Google Shopping"
38
+ SearchEngine.GOOGLE_SHOPPING.value: "Google Shopping",
37
39
  }
38
40
  _hostname_pattern = r"^(?:https?:\/\/)?([^\/:?#]+)"
39
41
 
@@ -0,0 +1,57 @@
1
+ import logging
2
+ from typing import List, Set, Tuple
3
+ from urllib.parse import urlparse, parse_qsl, urlencode, quote, urlunparse, ParseResult
4
+
5
+ from fraudcrawler.settings import KNOWN_TRACKERS
6
+
7
+ logger = logging.getLogger(__name__)
8
+
9
+
10
+ class URLCollector:
11
+ """A class to collect and de-duplicate URLs."""
12
+
13
+ def __init__(self):
14
+ self.collected_currently: Set[str] = set()
15
+ self.collected_previously: Set[str] = set()
16
+
17
+ @staticmethod
18
+ def remove_tracking_parameters(url: str) -> str:
19
+ """Remove tracking parameters from URLs.
20
+
21
+ Args:
22
+ url: The URL to clean.
23
+
24
+ Returns:
25
+ The cleaned URL without tracking parameters.
26
+ """
27
+ logging.debug(f"Removing tracking parameters from URL: {url}")
28
+
29
+ # Parse the url
30
+ parsed_url = urlparse(url)
31
+
32
+ # Parse query parameters
33
+ queries: List[Tuple[str, str]] = parse_qsl(
34
+ parsed_url.query, keep_blank_values=True
35
+ )
36
+ remove_all = url.startswith(
37
+ "https://www.ebay"
38
+ ) # eBay URLs have all query parameters as tracking parameters
39
+ if remove_all:
40
+ filtered_queries = []
41
+ else:
42
+ filtered_queries = [
43
+ q
44
+ for q in queries
45
+ if not any(q[0].startswith(tracker) for tracker in KNOWN_TRACKERS)
46
+ ]
47
+
48
+ # Rebuild the URL without tracking parameters
49
+ clean_url = ParseResult(
50
+ scheme=parsed_url.scheme,
51
+ netloc=parsed_url.netloc,
52
+ path=parsed_url.path,
53
+ params=parsed_url.params,
54
+ query=urlencode(filtered_queries, quote_via=quote),
55
+ fragment=parsed_url.fragment,
56
+ )
57
+ return urlunparse(clean_url)
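
Two behaviours of `remove_tracking_parameters` worth noting: query parameters whose names start with any entry in `KNOWN_TRACKERS` are dropped, and URLs beginning with `https://www.ebay` lose their query string entirely. A quick sketch with made-up URLs:

    from fraudcrawler import URLCollector

    # utm_* and srsltid are stripped, the functional "color" parameter survives
    print(URLCollector.remove_tracking_parameters(
        "https://shop.example.ch/item/42?color=blue&utm_source=google&srsltid=abc123"
    ))
    # -> https://shop.example.ch/item/42?color=blue

    # every query parameter on an eBay URL is treated as tracking
    print(URLCollector.remove_tracking_parameters(
        "https://www.ebay.ch/itm/1234567890?hash=item1&_trkparms=foo"
    ))
    # -> https://www.ebay.ch/itm/1234567890
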
@@ -1,6 +1,7 @@
1
1
  import asyncio
2
2
  import logging
3
3
  from typing import List
4
+ from base64 import b64decode
4
5
 
5
6
  import aiohttp
6
7
 
@@ -68,7 +69,8 @@ class ZyteApi(AsyncClient):
68
69
  "metadata": {
69
70
  "probability": float,
70
71
  },
71
- }
72
+ },
73
+ "httpResponseBody": base64
72
74
  }
73
75
  """
74
76
  logger.info(f"Fetching product details by Zyte for URL {url}.")
@@ -192,3 +194,24 @@ class ZyteApi(AsyncClient):
192
194
  }
193
195
  """
194
196
  return float(details.get("product", {}).get("metadata", {}).get("probability"))
197
+
198
+ @staticmethod
199
+ def extract_html(details: dict) -> str | None:
200
+ """Extracts the HTML from the Zyte API response.
201
+
202
+ The input argument is a dictionary of the following structure:
203
+ {
204
+ "httpResponseBody": base64
205
+ }
206
+ """
207
+
208
+ # Get the Base64-encoded content
209
+ encoded = details.get("httpResponseBody")
210
+
211
+ # Decode it into bytes
212
+ if isinstance(encoded, str):
213
+ decoded_bytes = b64decode(encoded)
214
+
215
+ # Convert bytes to string (assuming UTF-8 encoding)
216
+ decoded_string = decoded_bytes.decode("utf-8")
217
+ return decoded_string
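
`extract_html` base64-decodes `httpResponseBody` and assumes UTF-8; when the key is missing or not a string, the method falls through and implicitly returns None. A self-contained sketch against a fabricated response dictionary:

    from base64 import b64encode
    from fraudcrawler import ZyteApi

    page = "<html><body><h1>Medion Kühlbox MD 37454</h1></body></html>"
    fake_response = {"httpResponseBody": b64encode(page.encode("utf-8")).decode("ascii")}

    print(ZyteApi.extract_html(details=fake_response))  # the original HTML string
    print(ZyteApi.extract_html(details={}))              # None: no httpResponseBody present
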
@@ -13,6 +13,18 @@ SERP_DEFAULT_COUNTRY_CODES: List[str] = [
13
13
  # ".com",
14
14
  ]
15
15
 
16
+ # URL De-duplication settings
17
+ KNOWN_TRACKERS = [
18
+ "srsltid",
19
+ "utm_source",
20
+ "utm_medium",
21
+ "utm_campaign",
22
+ "utm_term",
23
+ "utm_content",
24
+ "ar",
25
+ "ps",
26
+ ]
27
+
16
28
  # Enrichment settings
17
29
  ENRICHMENT_DEFAULT_LIMIT = 10
18
30
 
@@ -22,9 +34,8 @@ ZYTE_DEFALUT_PROBABILITY_THRESHOLD = 0.1
22
34
  # Processor settings
23
35
  PROCESSOR_DEFAULT_MODEL = "gpt-4o"
24
36
  PROCESSOR_DEFAULT_IF_MISSING = -1
25
- PROCESSOR_USER_PROMPT_TEMPLATE = (
26
- "Context: {context}\n\nProduct Details: {name}\n{description}\\n\nRelevance:"
27
- )
37
+ PROCESSOR_USER_PROMPT_TEMPLATE = "Product Details:\n{product_details}\n\nRelevance:"
38
+ PROCESSOR_PRODUCT_DETAILS_TEMPLATE = "{field_name}:\n{field_value}"
28
39
 
29
40
  # Async settings
30
41
  DEFAULT_N_SERP_WKRS = 10
@@ -4,7 +4,7 @@ build-backend = "poetry.core.masonry.api"
4
4
 
5
5
  [tool.poetry]
6
6
  name = "fraudcrawler"
7
- version = "0.4.0"
7
+ version = "0.4.3"
8
8
  description = "Intelligent Market Monitoring"
9
9
  authors = [
10
10
  "Domingo Bertus <hello@veanu.ch>",
@@ -25,6 +25,7 @@ pandas = "^2.2.3"
25
25
  aiohttp = "^3.11.14"
26
26
  pydantic-settings = "^2.8.1"
27
27
  openai = "^1.68.2"
28
+ beautifulsoup4 = "^4.13.4"
28
29
 
29
30
  [tool.poetry.group.dev.dependencies]
30
31
  pytest-cov = "^6.0.0"
@@ -1,100 +0,0 @@
1
- import logging
2
-
3
- from fraudcrawler import FraudCrawlerClient, Language, Location, Deepness, Prompt
4
-
5
- LOG_FMT = "%(asctime)s | %(name)s | %(funcName)s | %(levelname)s | %(message)s"
6
- LOG_LVL = "INFO"
7
- DATE_FMT = "%Y-%m-%d %H:%M:%S"
8
- logging.basicConfig(format=LOG_FMT, level=LOG_LVL, datefmt=DATE_FMT)
9
-
10
-
11
- def main():
12
- # Setup the client
13
- client = FraudCrawlerClient()
14
-
15
- # Setup the search
16
- search_term = "Kühlschrank"
17
- language = Language(name="German")
18
- location = Location(name="Switzerland")
19
- deepness = Deepness(num_results=20)
20
- prompts = [
21
- Prompt(
22
- name="relevance",
23
- context="This organization is interested in checking the energy efficiency of certain devices.",
24
- system_prompt=(
25
- "You are a helpful and intelligent assistant. Your task is to classify any given product "
26
- "as either relevant (1) or not relevant (0), strictly based on the context and product details provided by the user. "
27
- "You must consider all aspects of the given context and make a binary decision accordingly. "
28
- "If the product aligns with the user's needs, classify it as 1 (relevant); otherwise, classify it as 0 (not relevant). "
29
- "Respond only with the number 1 or 0."
30
- ),
31
- allowed_classes=[0, 1],
32
- ),
33
- Prompt(
34
- name="seriousness",
35
- context="This organization is interested in checking the energy efficiency of certain devices.",
36
- system_prompt=(
37
- "You are an intelligent and discerning assistant. Your task is to classify each item as either "
38
- "a product for sale (1) or not a product for sale (0). To make this distinction, consider the following criteria: \n"
39
- " 1 Product for Sale (1): Classify as 1 if the result clearly indicates an item available for purchase, typically found "
40
- "within an online shop or marketplace.\n"
41
- " 2 Not a Product for Sale (0): Classify as 0 if the result is unrelated to a direct purchase of a product. This includes items such as: \n"
42
- " - Books and Videos: These may be available for sale, but if they are about or related to the searched product rather than being the "
43
- "exact product itself, classify as 0.\n"
44
- " - Advertisements: Promotional content that doesn't directly sell a product.\n"
45
- " - Companies and Services: Names and descriptions of companies or services related to the product but not the product itself.\n"
46
- " - Related Topics/Content: Any text or media that discusses or elaborates on the topic without offering a tangible product for sale.\n"
47
- "Make your decision based solely on the context and details provided in the search result. Respond only with the number 1 or 0."
48
- ),
49
- allowed_classes=[0, 1],
50
- ),
51
- ]
52
- # # Optional: Add tern ENRICHEMENT
53
- # from fraudcrawler import Enrichment
54
-
55
- # deepness.enrichment = Enrichment(additional_terms=10, additional_urls_per_term=20)
56
-
57
- # # Optional: Add MARKETPLACES and EXCLUDED_URLS
58
- # from fraudcrawler import Host
59
-
60
- # marketplaces = [
61
- # Host(name="International", domains="zavamed.com,apomeds.com"),
62
- # Host(name="National", domains="netdoktor.ch, nobelpharma.ch")
63
- # ]
64
- # excluded_urls = [
65
- # Host(name="Digitec", domains="digitec.ch"),
66
- # Host(name="Brack", domains="brack.ch"),
67
- # ]
68
-
69
- # Execute the pipeline
70
- client.execute(
71
- search_term=search_term,
72
- language=language,
73
- location=location,
74
- deepness=deepness,
75
- prompts=prompts,
76
- # marketplaces=marketplaces,
77
- # excluded_urls=excluded_urls,
78
- )
79
-
80
- # Show results
81
- print()
82
- title = "Available results"
83
- print(title)
84
- print("=" * len(title))
85
- client.print_available_results()
86
- print()
87
- title = f'Results for "{search_term.upper()}"'
88
- print(title)
89
- print("=" * len(title))
90
- df = client.load_results()
91
- print(f"Number of products found: {len(df)}")
92
- print()
93
- n_head = 10
94
- print(f"First {n_head} products are:")
95
- print(df.head(n=n_head))
96
- print()
97
-
98
-
99
- if __name__ == "__main__":
100
- main()