fraudcrawler 0.4.2__tar.gz → 0.4.5__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of fraudcrawler might be problematic.

Files changed (21)
  1. {fraudcrawler-0.4.2 → fraudcrawler-0.4.5}/PKG-INFO +2 -3
  2. {fraudcrawler-0.4.2 → fraudcrawler-0.4.5}/fraudcrawler/__init__.py +2 -0
  3. {fraudcrawler-0.4.2 → fraudcrawler-0.4.5}/fraudcrawler/base/base.py +11 -0
  4. {fraudcrawler-0.4.2 → fraudcrawler-0.4.5}/fraudcrawler/base/orchestrator.py +25 -12
  5. {fraudcrawler-0.4.2 → fraudcrawler-0.4.5}/fraudcrawler/launch_demo_pipeline.py +1 -1
  6. {fraudcrawler-0.4.2 → fraudcrawler-0.4.5}/fraudcrawler/processing/processor.py +34 -14
  7. fraudcrawler-0.4.5/fraudcrawler/scraping/url.py +57 -0
  8. fraudcrawler-0.4.5/fraudcrawler/settings.py +73 -0
  9. {fraudcrawler-0.4.2 → fraudcrawler-0.4.5}/pyproject.toml +1 -1
  10. fraudcrawler-0.4.2/fraudcrawler/settings.py +0 -31
  11. {fraudcrawler-0.4.2 → fraudcrawler-0.4.5}/LICENSE +0 -0
  12. {fraudcrawler-0.4.2 → fraudcrawler-0.4.5}/README.md +0 -0
  13. {fraudcrawler-0.4.2 → fraudcrawler-0.4.5}/fraudcrawler/base/__init__.py +0 -0
  14. {fraudcrawler-0.4.2 → fraudcrawler-0.4.5}/fraudcrawler/base/client.py +0 -0
  15. {fraudcrawler-0.4.2 → fraudcrawler-0.4.5}/fraudcrawler/base/google-languages.json +0 -0
  16. {fraudcrawler-0.4.2 → fraudcrawler-0.4.5}/fraudcrawler/base/google-locations.json +0 -0
  17. {fraudcrawler-0.4.2 → fraudcrawler-0.4.5}/fraudcrawler/processing/__init__.py +0 -0
  18. {fraudcrawler-0.4.2 → fraudcrawler-0.4.5}/fraudcrawler/scraping/__init__.py +0 -0
  19. {fraudcrawler-0.4.2 → fraudcrawler-0.4.5}/fraudcrawler/scraping/enrich.py +0 -0
  20. {fraudcrawler-0.4.2 → fraudcrawler-0.4.5}/fraudcrawler/scraping/serp.py +1 -1
  21. {fraudcrawler-0.4.2 → fraudcrawler-0.4.5}/fraudcrawler/scraping/zyte.py +0 -0

{fraudcrawler-0.4.2 → fraudcrawler-0.4.5}/PKG-INFO

@@ -1,6 +1,6 @@
- Metadata-Version: 2.3
+ Metadata-Version: 2.1
  Name: fraudcrawler
- Version: 0.4.2
+ Version: 0.4.5
  Summary: Intelligent Market Monitoring
  Home-page: https://github.com/open-veanu/fraudcrawler
  License: MIT
@@ -11,7 +11,6 @@ Classifier: License :: OSI Approved :: MIT License
  Classifier: Programming Language :: Python :: 3
  Classifier: Programming Language :: Python :: 3.11
  Classifier: Programming Language :: Python :: 3.12
- Classifier: Programming Language :: Python :: 3.13
  Requires-Dist: aiohttp (>=3.11.14,<4.0.0)
  Requires-Dist: beautifulsoup4 (>=4.13.4,<5.0.0)
  Requires-Dist: openai (>=1.68.2,<2.0.0)

{fraudcrawler-0.4.2 → fraudcrawler-0.4.5}/fraudcrawler/__init__.py

@@ -1,5 +1,6 @@
  from fraudcrawler.scraping.serp import SerpApi, SearchEngine
  from fraudcrawler.scraping.enrich import Enricher
+ from fraudcrawler.scraping.url import URLCollector
  from fraudcrawler.scraping.zyte import ZyteApi
  from fraudcrawler.processing.processor import Processor
  from fraudcrawler.base.orchestrator import Orchestrator
@@ -18,6 +19,7 @@ __all__ = [
  "SerpApi",
  "SearchEngine",
  "Enricher",
+ "URLCollector",
  "ZyteApi",
  "Processor",
  "Orchestrator",

{fraudcrawler-0.4.2 → fraudcrawler-0.4.5}/fraudcrawler/base/base.py

@@ -63,6 +63,14 @@ class Host(BaseModel):
  return [cls._normalize_domain(dom.strip()) for dom in val]


+ class ClassificationResult(BaseModel):
+ """Model for classification results."""
+
+ result: int
+ input_tokens: int
+ output_tokens: int
+
+
  class Location(BaseModel):
  """Model for location details (e.g. `Location(name="Switzerland", code="ch")`)."""

@@ -137,6 +145,9 @@ class ProductItem(BaseModel):
  # Processor parameters are set dynamic so we must allow extra fields
  classifications: Dict[str, int] = Field(default_factory=dict)

+ # Usage parameters
+ usage: Dict[str, Dict[str, int]] = Field(default_factory=dict)
+
  # Filtering parameters
  filtered: bool = False
  filtered_at_stage: str | None = None

{fraudcrawler-0.4.2 → fraudcrawler-0.4.5}/fraudcrawler/base/orchestrator.py

@@ -1,7 +1,8 @@
  from abc import ABC, abstractmethod
  import asyncio
  import logging
- from typing import Dict, List, Set, cast
+ from typing import Dict, List, cast
+
  from bs4 import BeautifulSoup

  from fraudcrawler.settings import (
@@ -24,7 +25,14 @@ from fraudcrawler.base.base import (
  Prompt,
  ProductItem,
  )
- from fraudcrawler import SerpApi, SearchEngine, Enricher, ZyteApi, Processor
+ from fraudcrawler import (
+ SerpApi,
+ SearchEngine,
+ Enricher,
+ URLCollector,
+ ZyteApi,
+ Processor,
+ )

  logger = logging.getLogger(__name__)

@@ -75,15 +83,12 @@ class Orchestrator(ABC):
  n_zyte_wkrs: Number of async workers for zyte (optional).
  n_proc_wkrs: Number of async workers for the processor (optional).
  """
- # Setup the variables
- self._collected_urls_current_run: Set[str] = set()
- self._collected_urls_previous_runs: Set[str] = set()
-
  # Setup the clients
  self._serpapi = SerpApi(
  api_key=serpapi_key, max_retries=max_retries, retry_delay=retry_delay
  )
  self._enricher = Enricher(user=dataforseo_user, pwd=dataforseo_pwd)
+ self._url_collector = URLCollector()
  self._zyteapi = ZyteApi(
  api_key=zyteapi_key, max_retries=max_retries, retry_delay=retry_delay
  )
@@ -156,16 +161,18 @@
  break

  if not product.filtered:
- url = product.url
+ # Clean the URL by removing tracking parameters
+ url = self._url_collector.remove_tracking_parameters(product.url)
+ product.url = url

- if url in self._collected_urls_current_run:
+ if url in self._url_collector.collected_currently:
  # deduplicate on current run
  product.filtered = True
  product.filtered_at_stage = (
  "URL collection (current run deduplication)"
  )
  logger.debug(f"URL {url} already collected in current run")
- elif url in self._collected_urls_previous_runs:
+ elif url in self._url_collector.collected_previously:
  # deduplicate on previous runs coming from a db
  product.filtered = True
  product.filtered_at_stage = (
@@ -173,7 +180,7 @@
  )
  logger.debug(f"URL {url} as already collected in previous run")
  else:
- self._collected_urls_current_run.add(url)
+ self._url_collector.collected_currently.add(url)

  await queue_out.put(product)
  queue_in.task_done()
@@ -274,7 +281,13 @@
  url=url,
  product_details=product_details,
  )
- product.classifications[prompt.name] = classification
+ product.classifications[prompt.name] = int(
+ classification.result
+ )
+ product.usage[prompt.name] = {
+ "input_tokens": classification.input_tokens,
+ "output_tokens": classification.output_tokens,
+ }
  except Exception as e:
  logger.warning(f"Error processing product: {e}.")

@@ -480,7 +493,7 @@
  # INITIAL SETUP
  # ---------------------------
  if previously_collected_urls:
- self._collected_urls_previous_runs = set(self._collected_urls_current_run)
+ self._url_collector.collected_previously = set(previously_collected_urls)

  # Setup the async framework
  n_terms_max = 1 + (

{fraudcrawler-0.4.2 → fraudcrawler-0.4.5}/fraudcrawler/launch_demo_pipeline.py

@@ -13,7 +13,7 @@ def main():
  client = FraudCrawlerClient()

  # Setup the search
- search_term = "Kühlschrank"
+ search_term = "Medion Kühlbox MD 37454"
  language = Language(name="German")
  location = Location(name="Switzerland")
  deepness = Deepness(num_results=10)

{fraudcrawler-0.4.2 → fraudcrawler-0.4.5}/fraudcrawler/processing/processor.py

@@ -2,10 +2,11 @@ import logging

  from openai import AsyncOpenAI

- from fraudcrawler.base.base import Prompt
+ from fraudcrawler.base.base import Prompt, ClassificationResult
  from fraudcrawler.settings import (
  PROCESSOR_USER_PROMPT_TEMPLATE,
  PROCESSOR_DEFAULT_IF_MISSING,
+ PROCESSOR_EMPTY_TOKEN_COUNT,
  )


@@ -20,6 +21,7 @@ class Processor:
  api_key: str,
  model: str,
  default_if_missing: int = PROCESSOR_DEFAULT_IF_MISSING,
+ empty_token_count: int = PROCESSOR_EMPTY_TOKEN_COUNT,
  ):
  """Initializes the Processor.

@@ -27,17 +29,22 @@
  api_key: The OpenAI API key.
  model: The OpenAI model to use.
  default_if_missing: The default classification to return if error occurs.
+ empty_token_count: The default value to return as tokensif the classification is empty.
  """
  self._client = AsyncOpenAI(api_key=api_key)
  self._model = model
- self._default_if_missing = default_if_missing
+ self._error_response = ClassificationResult(
+ result=default_if_missing,
+ input_tokens=empty_token_count,
+ output_tokens=empty_token_count,
+ )

  async def _call_openai_api(
  self,
  system_prompt: str,
  user_prompt: str,
  **kwargs,
- ) -> str:
+ ) -> ClassificationResult:
  """Calls the OpenAI API with the given user prompt."""
  response = await self._client.chat.completions.create(
  model=self._model,
@@ -50,10 +57,24 @@
  content = response.choices[0].message.content
  if not content:
  raise ValueError("Empty response from OpenAI API")
- return content

- async def classify(self, prompt: Prompt, url: str, product_details: str) -> int:
- """A generic classification method that classifies a product based on a prompt object.
+ # Convert the content to an integer
+ content = int(content.strip())
+
+ # For tracking consumption we alre return the tokens used
+ classification = ClassificationResult(
+ result=content,
+ input_tokens=response.usage.prompt_tokens,
+ output_tokens=response.usage.completion_tokens,
+ )
+
+ return classification
+
+ async def classify(
+ self, prompt: Prompt, url: str, product_details: str
+ ) -> ClassificationResult:
+ """A generic classification method that classifies a product based on a prompt object and returns
+ the classification, input tokens, and output tokens.

  Args:
  prompt: A dictionary with keys "system_prompt", etc.
@@ -69,7 +90,7 @@
  # If required fields are missing, return the prompt's default fallback if provided.
  if not product_details:
  logger.warning("Missing required product_details for classification.")
- return self._default_if_missing
+ return self._error_response

  # Substitute placeholders in user_prompt with the relevant arguments
  user_prompt = PROCESSOR_USER_PROMPT_TEMPLATE.format(
@@ -81,22 +102,21 @@
  logger.debug(
  f'Calling OpenAI API for classification (url="{url}", prompt="{prompt.name}")'
  )
- content = await self._call_openai_api(
+ classification = await self._call_openai_api(
  system_prompt=prompt.system_prompt,
  user_prompt=user_prompt,
  max_tokens=1,
  )
- classification = int(content.strip())

  # Enforce that the classification is in the allowed classes
- if classification not in prompt.allowed_classes:
+ if classification.result not in prompt.allowed_classes:
  logger.warning(
- f"Classification '{classification}' not in allowed classes {prompt.allowed_classes}"
+ f"Classification '{classification.result}' not in allowed classes {prompt.allowed_classes}"
  )
- return self._default_if_missing
+ return self._error_response

  logger.info(
- f'Classification for url="{url}" (prompt={prompt.name}): {classification}'
+ f'Classification for url="{url}" (prompt={prompt.name}): {classification.result} and total tokens used: {classification.input_tokens + classification.output_tokens}'
  )
  return classification

@@ -104,4 +124,4 @@
  logger.error(
  f'Error classifying product at url="{url}" with prompt "{prompt.name}": {e}'
  )
- return self._default_if_missing
+ return self._error_response
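
Taken together, these processor changes mean `classify` now returns a `ClassificationResult` (label plus token usage) instead of a bare int. A minimal consumption sketch based only on the names visible in this diff; the exact `Prompt` constructor arguments are assumptions and may differ in the released code:

    import asyncio

    from fraudcrawler import Processor
    from fraudcrawler.base.base import Prompt

    async def main():
        processor = Processor(api_key="sk-...", model="gpt-4o")

        # Hypothetical prompt definition; only name, system_prompt and
        # allowed_classes are referenced in the diff above.
        prompt = Prompt(
            name="relevance",
            system_prompt="Reply with 1 if the product is relevant, otherwise 0.",
            allowed_classes=[0, 1],
        )

        classification = await processor.classify(
            prompt=prompt,
            url="https://example.com/product",
            product_details="Title:\nMedion Kühlbox MD 37454",
        )

        # The result object carries the label plus token consumption.
        print(classification.result, classification.input_tokens, classification.output_tokens)

    asyncio.run(main())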

fraudcrawler-0.4.5/fraudcrawler/scraping/url.py (new file)

@@ -0,0 +1,57 @@
+ import logging
+ from typing import List, Set, Tuple
+ from urllib.parse import urlparse, parse_qsl, urlencode, quote, urlunparse, ParseResult
+
+ from fraudcrawler.settings import KNOWN_TRACKERS
+
+ logger = logging.getLogger(__name__)
+
+
+ class URLCollector:
+ """A class to collect and de-duplicate URLs."""
+
+ def __init__(self):
+ self.collected_currently: Set[str] = set()
+ self.collected_previously: Set[str] = set()
+
+ @staticmethod
+ def remove_tracking_parameters(url: str) -> str:
+ """Remove tracking parameters from URLs.
+
+ Args:
+ url: The URL to clean.
+
+ Returns:
+ The cleaned URL without tracking parameters.
+ """
+ logging.debug(f"Removing tracking parameters from URL: {url}")
+
+ # Parse the url
+ parsed_url = urlparse(url)
+
+ # Parse query parameters
+ queries: List[Tuple[str, str]] = parse_qsl(
+ parsed_url.query, keep_blank_values=True
+ )
+ remove_all = url.startswith(
+ "https://www.ebay"
+ ) # eBay URLs have all query parameters as tracking parameters
+ if remove_all:
+ filtered_queries = []
+ else:
+ filtered_queries = [
+ q
+ for q in queries
+ if not any(q[0].startswith(tracker) for tracker in KNOWN_TRACKERS)
+ ]
+
+ # Rebuild the URL without tracking parameters
+ clean_url = ParseResult(
+ scheme=parsed_url.scheme,
+ netloc=parsed_url.netloc,
+ path=parsed_url.path,
+ params=parsed_url.params,
+ query=urlencode(filtered_queries, quote_via=quote),
+ fragment=parsed_url.fragment,
+ )
+ return urlunparse(clean_url)
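
For illustration, a short usage sketch of the new URLCollector; the example URL is invented, and the expected output follows from the remove_tracking_parameters logic above and the KNOWN_TRACKERS list added in settings.py below:

    from fraudcrawler import URLCollector

    collector = URLCollector()

    # utm_* and gclid parameters are stripped; the functional "id" parameter is kept.
    dirty = "https://example.com/product?id=42&utm_source=newsletter&gclid=abc123"
    clean = collector.remove_tracking_parameters(dirty)
    print(clean)  # https://example.com/product?id=42

    # These two sets back the current-run / previous-run de-duplication in the orchestrator.
    collector.collected_currently.add(clean)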

fraudcrawler-0.4.5/fraudcrawler/settings.py (new file)

@@ -0,0 +1,73 @@
+ from pathlib import Path
+ from typing import List
+
+ # Generic settings
+ MAX_RETRIES = 3
+ RETRY_DELAY = 2
+ ROOT_DIR = Path(__file__).parents[1]
+
+ # Serp settings
+ GOOGLE_LOCATIONS_FILENAME = ROOT_DIR / "fraudcrawler" / "base" / "google-locations.json"
+ GOOGLE_LANGUAGES_FILENAME = ROOT_DIR / "fraudcrawler" / "base" / "google-languages.json"
+ SERP_DEFAULT_COUNTRY_CODES: List[str] = [
+ # ".com",
+ ]
+
+ # URL De-duplication settings
+ KNOWN_TRACKERS = [
+ "srsltid", # Search result click ID (used by some search engines)
+ "utm_source", # UTM: Source of the traffic (e.g., Google, Newsletter)
+ "utm_medium", # UTM: Medium such as CPC, email, social
+ "utm_campaign", # UTM: Campaign name (e.g., summer_sale)
+ "utm_term", # UTM: Keyword term (used in paid search)
+ "utm_content", # UTM: Used to differentiate similar links or ads
+ "ar", # Often used for ad region or targeting info
+ "ps", # Could refer to promotion source or partner segment
+ "gclid", # Google Ads click ID (auto-tagging)
+ "gclsrc", # Source of the GCLID (e.g., ads, search)
+ "sku", # Product SKU identifier, often used in ecommerce links
+ "ref", # Referrer username or source (e.g., GitHub ref links)
+ "referral", # Alternate form of referrer, often human-readable
+ "aff_id", # Affiliate identifier (ID-based)
+ "aff", # Short form for affiliate tag
+ "affiliate", # Affiliate tracking parameter (human-readable)
+ "partner", # Indicates marketing or distribution partner
+ "fbclid", # Facebook Click Identifier
+ "msclkid", # Microsoft/Bing Ads click identifier
+ "twclid", # Twitter Ads click identifier
+ "variant", # A/B test variant (used to test versions of pages)
+ "session_id", # Session tracking ID, should not persist across URLs
+ "track", # Generic flag used to enable/disable tracking
+ "cid", # Campaign ID (used in ads or emails)
+ "campaignid", # Alternate or long-form campaign ID
+ "adgroup", # Ad group identifier for campaigns
+ "bannerid", # Specific banner ad ID (for display ad tracking)
+ "token", # Often used to identify users or temporary sessions
+ "tag", # Affiliate or marketing tag (used for tracking)
+ "hash", # Generic hash identifier, often for state or cache
+ "user", # User ID or identifier passed in URL (should be avoided)
+ "src", # Generic source indicator, less formal than `utm_source`
+ "selsort", # Sorting parameter for search results
+ "shid", # Shop ID (used in ecommerce)
+ "shoparea", # Shop area (used in ecommerce)
+ "shopid", # Shop ID (used in ecommerce)
+ "shoparea", # Shop area (used in ecommerce)
+ ]
+
+ # Enrichment settings
+ ENRICHMENT_DEFAULT_LIMIT = 10
+
+ # Zyte settings
+ ZYTE_DEFALUT_PROBABILITY_THRESHOLD = 0.1
+
+ # Processor settings
+ PROCESSOR_DEFAULT_MODEL = "gpt-4o"
+ PROCESSOR_DEFAULT_IF_MISSING = -1
+ PROCESSOR_EMPTY_TOKEN_COUNT = -1
+ PROCESSOR_USER_PROMPT_TEMPLATE = "Product Details:\n{product_details}\n\nRelevance:"
+ PROCESSOR_PRODUCT_DETAILS_TEMPLATE = "{field_name}:\n{field_value}"
+
+ # Async settings
+ DEFAULT_N_SERP_WKRS = 10
+ DEFAULT_N_ZYTE_WKRS = 10
+ DEFAULT_N_PROC_WKRS = 10

{fraudcrawler-0.4.2 → fraudcrawler-0.4.5}/pyproject.toml

@@ -4,7 +4,7 @@ build-backend = "poetry.core.masonry.api"

  [tool.poetry]
  name = "fraudcrawler"
- version = "0.4.2"
+ version = "0.4.5"
  description = "Intelligent Market Monitoring"
  authors = [
  "Domingo Bertus <hello@veanu.ch>",

fraudcrawler-0.4.2/fraudcrawler/settings.py (removed)

@@ -1,31 +0,0 @@
- from pathlib import Path
- from typing import List
-
- # Generic settings
- MAX_RETRIES = 3
- RETRY_DELAY = 2
- ROOT_DIR = Path(__file__).parents[1]
-
- # Serp settings
- GOOGLE_LOCATIONS_FILENAME = ROOT_DIR / "fraudcrawler" / "base" / "google-locations.json"
- GOOGLE_LANGUAGES_FILENAME = ROOT_DIR / "fraudcrawler" / "base" / "google-languages.json"
- SERP_DEFAULT_COUNTRY_CODES: List[str] = [
- # ".com",
- ]
-
- # Enrichment settings
- ENRICHMENT_DEFAULT_LIMIT = 10
-
- # Zyte settings
- ZYTE_DEFALUT_PROBABILITY_THRESHOLD = 0.1
-
- # Processor settings
- PROCESSOR_DEFAULT_MODEL = "gpt-4o"
- PROCESSOR_DEFAULT_IF_MISSING = -1
- PROCESSOR_USER_PROMPT_TEMPLATE = "Product Details:\n{product_details}\n\nRelevance:"
- PROCESSOR_PRODUCT_DETAILS_TEMPLATE = "{field_name}:\n{field_value}"
-
- # Async settings
- DEFAULT_N_SERP_WKRS = 10
- DEFAULT_N_ZYTE_WKRS = 10
- DEFAULT_N_PROC_WKRS = 10

{fraudcrawler-0.4.2 → fraudcrawler-0.4.5}/fraudcrawler/scraping/serp.py

@@ -4,10 +4,10 @@ import logging
  from pydantic import BaseModel
  from typing import List
  from urllib.parse import urlparse
+ import re

  from fraudcrawler.settings import MAX_RETRIES, RETRY_DELAY, SERP_DEFAULT_COUNTRY_CODES
  from fraudcrawler.base.base import Host, Language, Location, AsyncClient
- import re

  logger = logging.getLogger(__name__)
