fraudcrawler 0.4.3__py3-none-any.whl → 0.4.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of fraudcrawler might be problematic. Click here for more details.

fraudcrawler/base/base.py CHANGED
@@ -63,6 +63,14 @@ class Host(BaseModel):
63
63
  return [cls._normalize_domain(dom.strip()) for dom in val]
64
64
 
65
65
 
66
+ class ClassificationResult(BaseModel):
67
+ """Model for classification results."""
68
+
69
+ result: int
70
+ input_tokens: int
71
+ output_tokens: int
72
+
73
+
66
74
  class Location(BaseModel):
67
75
  """Model for location details (e.g. `Location(name="Switzerland", code="ch")`)."""
68
76
 
@@ -137,6 +145,9 @@ class ProductItem(BaseModel):
137
145
  # Processor parameters are set dynamic so we must allow extra fields
138
146
  classifications: Dict[str, int] = Field(default_factory=dict)
139
147
 
148
+ # Usage parameters
149
+ usage: Dict[str, Dict[str, int]] = Field(default_factory=dict)
150
+
140
151
  # Filtering parameters
141
152
  filtered: bool = False
142
153
  filtered_at_stage: str | None = None
@@ -281,7 +281,13 @@ class Orchestrator(ABC):
281
281
  url=url,
282
282
  product_details=product_details,
283
283
  )
284
- product.classifications[prompt.name] = classification
284
+ product.classifications[prompt.name] = int(
285
+ classification.result
286
+ )
287
+ product.usage[prompt.name] = {
288
+ "input_tokens": classification.input_tokens,
289
+ "output_tokens": classification.output_tokens,
290
+ }
285
291
  except Exception as e:
286
292
  logger.warning(f"Error processing product: {e}.")
287
293
 
@@ -2,10 +2,11 @@ import logging
2
2
 
3
3
  from openai import AsyncOpenAI
4
4
 
5
- from fraudcrawler.base.base import Prompt
5
+ from fraudcrawler.base.base import Prompt, ClassificationResult
6
6
  from fraudcrawler.settings import (
7
7
  PROCESSOR_USER_PROMPT_TEMPLATE,
8
8
  PROCESSOR_DEFAULT_IF_MISSING,
9
+ PROCESSOR_EMPTY_TOKEN_COUNT,
9
10
  )
10
11
 
11
12
 
@@ -20,6 +21,7 @@ class Processor:
20
21
  api_key: str,
21
22
  model: str,
22
23
  default_if_missing: int = PROCESSOR_DEFAULT_IF_MISSING,
24
+ empty_token_count: int = PROCESSOR_EMPTY_TOKEN_COUNT,
23
25
  ):
24
26
  """Initializes the Processor.
25
27
 
@@ -27,17 +29,22 @@ class Processor:
27
29
  api_key: The OpenAI API key.
28
30
  model: The OpenAI model to use.
29
31
  default_if_missing: The default classification to return if error occurs.
32
+ empty_token_count: The default value to return as tokens if the classification is empty.
30
33
  """
31
34
  self._client = AsyncOpenAI(api_key=api_key)
32
35
  self._model = model
33
- self._default_if_missing = default_if_missing
36
+ self._error_response = ClassificationResult(
37
+ result=default_if_missing,
38
+ input_tokens=empty_token_count,
39
+ output_tokens=empty_token_count,
40
+ )
34
41
 
35
42
  async def _call_openai_api(
36
43
  self,
37
44
  system_prompt: str,
38
45
  user_prompt: str,
39
46
  **kwargs,
40
- ) -> str:
47
+ ) -> ClassificationResult:
41
48
  """Calls the OpenAI API with the given user prompt."""
42
49
  response = await self._client.chat.completions.create(
43
50
  model=self._model,
@@ -50,10 +57,24 @@ class Processor:
50
57
  content = response.choices[0].message.content
51
58
  if not content:
52
59
  raise ValueError("Empty response from OpenAI API")
53
- return content
54
60
 
55
- async def classify(self, prompt: Prompt, url: str, product_details: str) -> int:
56
- """A generic classification method that classifies a product based on a prompt object.
61
+ # Convert the content to an integer
62
+ content = int(content.strip())
63
+
64
+ # For tracking consumption we also return the tokens used
65
+ classification = ClassificationResult(
66
+ result=content,
67
+ input_tokens=response.usage.prompt_tokens,
68
+ output_tokens=response.usage.completion_tokens,
69
+ )
70
+
71
+ return classification
72
+
73
+ async def classify(
74
+ self, prompt: Prompt, url: str, product_details: str
75
+ ) -> ClassificationResult:
76
+ """A generic classification method that classifies a product based on a prompt object and returns
77
+ the classification, input tokens, and output tokens.
57
78
 
58
79
  Args:
59
80
  prompt: A dictionary with keys "system_prompt", etc.
@@ -69,7 +90,7 @@ class Processor:
69
90
  # If required fields are missing, return the prompt's default fallback if provided.
70
91
  if not product_details:
71
92
  logger.warning("Missing required product_details for classification.")
72
- return self._default_if_missing
93
+ return self._error_response
73
94
 
74
95
  # Substitute placeholders in user_prompt with the relevant arguments
75
96
  user_prompt = PROCESSOR_USER_PROMPT_TEMPLATE.format(
@@ -81,22 +102,21 @@ class Processor:
81
102
  logger.debug(
82
103
  f'Calling OpenAI API for classification (url="{url}", prompt="{prompt.name}")'
83
104
  )
84
- content = await self._call_openai_api(
105
+ classification = await self._call_openai_api(
85
106
  system_prompt=prompt.system_prompt,
86
107
  user_prompt=user_prompt,
87
108
  max_tokens=1,
88
109
  )
89
- classification = int(content.strip())
90
110
 
91
111
  # Enforce that the classification is in the allowed classes
92
- if classification not in prompt.allowed_classes:
112
+ if classification.result not in prompt.allowed_classes:
93
113
  logger.warning(
94
- f"Classification '{classification}' not in allowed classes {prompt.allowed_classes}"
114
+ f"Classification '{classification.result}' not in allowed classes {prompt.allowed_classes}"
95
115
  )
96
- return self._default_if_missing
116
+ return self._error_response
97
117
 
98
118
  logger.info(
99
- f'Classification for url="{url}" (prompt={prompt.name}): {classification}'
119
+ f'Classification for url="{url}" (prompt={prompt.name}): {classification.result} and total tokens used: {classification.input_tokens + classification.output_tokens}'
100
120
  )
101
121
  return classification
102
122
 
@@ -104,4 +124,4 @@ class Processor:
104
124
  logger.error(
105
125
  f'Error classifying product at url="{url}" with prompt "{prompt.name}": {e}'
106
126
  )
107
- return self._default_if_missing
127
+ return self._error_response
fraudcrawler/settings.py CHANGED
@@ -15,14 +15,43 @@ SERP_DEFAULT_COUNTRY_CODES: List[str] = [
15
15
 
16
16
  # URL De-duplication settings
17
17
  KNOWN_TRACKERS = [
18
- "srsltid",
19
- "utm_source",
20
- "utm_medium",
21
- "utm_campaign",
22
- "utm_term",
23
- "utm_content",
24
- "ar",
25
- "ps",
18
+ "srsltid", # Search result click ID (used by some search engines)
19
+ "utm_source", # UTM: Source of the traffic (e.g., Google, Newsletter)
20
+ "utm_medium", # UTM: Medium such as CPC, email, social
21
+ "utm_campaign", # UTM: Campaign name (e.g., summer_sale)
22
+ "utm_term", # UTM: Keyword term (used in paid search)
23
+ "utm_content", # UTM: Used to differentiate similar links or ads
24
+ "ar", # Often used for ad region or targeting info
25
+ "ps", # Could refer to promotion source or partner segment
26
+ "gclid", # Google Ads click ID (auto-tagging)
27
+ "gclsrc", # Source of the GCLID (e.g., ads, search)
28
+ "sku", # Product SKU identifier, often used in ecommerce links
29
+ "ref", # Referrer username or source (e.g., GitHub ref links)
30
+ "referral", # Alternate form of referrer, often human-readable
31
+ "aff_id", # Affiliate identifier (ID-based)
32
+ "aff", # Short form for affiliate tag
33
+ "affiliate", # Affiliate tracking parameter (human-readable)
34
+ "partner", # Indicates marketing or distribution partner
35
+ "fbclid", # Facebook Click Identifier
36
+ "msclkid", # Microsoft/Bing Ads click identifier
37
+ "twclid", # Twitter Ads click identifier
38
+ "variant", # A/B test variant (used to test versions of pages)
39
+ "session_id", # Session tracking ID, should not persist across URLs
40
+ "track", # Generic flag used to enable/disable tracking
41
+ "cid", # Campaign ID (used in ads or emails)
42
+ "campaignid", # Alternate or long-form campaign ID
43
+ "adgroup", # Ad group identifier for campaigns
44
+ "bannerid", # Specific banner ad ID (for display ad tracking)
45
+ "token", # Often used to identify users or temporary sessions
46
+ "tag", # Affiliate or marketing tag (used for tracking)
47
+ "hash", # Generic hash identifier, often for state or cache
48
+ "user", # User ID or identifier passed in URL (should be avoided)
49
+ "src", # Generic source indicator, less formal than `utm_source`
50
+ "selsort", # Sorting parameter for search results
51
+ "shid", # Shop ID (used in ecommerce)
52
+ "shoparea", # Shop area (used in ecommerce)
53
+ "shopid", # Shop ID (used in ecommerce)
54
+ "shoparea", # Shop area (used in ecommerce)
26
55
  ]
27
56
 
28
57
  # Enrichment settings
@@ -34,6 +63,7 @@ ZYTE_DEFALUT_PROBABILITY_THRESHOLD = 0.1
34
63
  # Processor settings
35
64
  PROCESSOR_DEFAULT_MODEL = "gpt-4o"
36
65
  PROCESSOR_DEFAULT_IF_MISSING = -1
66
+ PROCESSOR_EMPTY_TOKEN_COUNT = -1
37
67
  PROCESSOR_USER_PROMPT_TEMPLATE = "Product Details:\n{product_details}\n\nRelevance:"
38
68
  PROCESSOR_PRODUCT_DETAILS_TEMPLATE = "{field_name}:\n{field_value}"
39
69
 
@@ -1,6 +1,6 @@
1
- Metadata-Version: 2.3
1
+ Metadata-Version: 2.1
2
2
  Name: fraudcrawler
3
- Version: 0.4.3
3
+ Version: 0.4.5
4
4
  Summary: Intelligent Market Monitoring
5
5
  Home-page: https://github.com/open-veanu/fraudcrawler
6
6
  License: MIT
@@ -11,7 +11,6 @@ Classifier: License :: OSI Approved :: MIT License
11
11
  Classifier: Programming Language :: Python :: 3
12
12
  Classifier: Programming Language :: Python :: 3.11
13
13
  Classifier: Programming Language :: Python :: 3.12
14
- Classifier: Programming Language :: Python :: 3.13
15
14
  Requires-Dist: aiohttp (>=3.11.14,<4.0.0)
16
15
  Requires-Dist: beautifulsoup4 (>=4.13.4,<5.0.0)
17
16
  Requires-Dist: openai (>=1.68.2,<2.0.0)
@@ -1,21 +1,21 @@
1
1
  fraudcrawler/__init__.py,sha256=zAqnJ9Mewq0qzSfOjyaICyqDRQZE_Z3FmyF2IPdOhXo,788
2
2
  fraudcrawler/base/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
3
- fraudcrawler/base/base.py,sha256=JWjZ3mpX4caQAsWKYqtHrUqHfHr6GXlAaEjxxHV9ODQ,6020
3
+ fraudcrawler/base/base.py,sha256=pYGdRV_Ssw5fA6tLVhlZwAO0OLQl6qn6LgJPCzOCrpc,6258
4
4
  fraudcrawler/base/client.py,sha256=FibiYycjUys-c4sv66Y2JqJu5y15be2MYd2_9yB3wG8,4936
5
5
  fraudcrawler/base/google-languages.json,sha256=z0VtjMCsCcZq11OkCIb9jMDD1p9Ty4lhV7bq4ddYvec,10748
6
6
  fraudcrawler/base/google-locations.json,sha256=UtNu0iSStllvFRTQXMobWKmZR1hKmtgtHftLNgaJTT0,9204345
7
- fraudcrawler/base/orchestrator.py,sha256=xOMxA0zPUXSF8AGY5AUqzsOO9LfRIjxI2HuZf__Z_sI,24689
7
+ fraudcrawler/base/orchestrator.py,sha256=cum23GYZHzaivFi8ZAIvbzCkpwwdvyhxBROAaDpTEeM,24984
8
8
  fraudcrawler/launch_demo_pipeline.py,sha256=CX4A-E63ER7Ip9RNI_IyTAXerYXcQ-NoSvhvLDLdP-s,4640
9
9
  fraudcrawler/processing/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
10
- fraudcrawler/processing/processor.py,sha256=An2orst0YRIav7bFuoDMgjwWz2Z9dyjVUbkNAMXNTTo,3748
10
+ fraudcrawler/processing/processor.py,sha256=LM9aSJiVskrwqdoI7FnEglM6zK8fW6e5bhp8ecUuD0E,4633
11
11
  fraudcrawler/scraping/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
12
12
  fraudcrawler/scraping/enrich.py,sha256=X1BBZshdZqPmbduzhGwH0ULSzq03L_7bf7_UL8yOQ9E,10608
13
13
  fraudcrawler/scraping/serp.py,sha256=divEp1UBUsws24PWZABhWIxOmaLqLwdeGn4KNrqWkYA,17865
14
14
  fraudcrawler/scraping/url.py,sha256=5Z3hPW73E-TLhM-Zha8OTcUOumc_rcx64R0fT9z2Hi8,1748
15
15
  fraudcrawler/scraping/zyte.py,sha256=DUF5pIwpZyQw30qURnFxtp8KYpUgBkrXjM7RaVGH92Q,7005
16
- fraudcrawler/settings.py,sha256=31jvRFfB-gsVbeidLLl4iQgrFL7GH-824lerIniPI08,1017
17
- fraudcrawler-0.4.3.dist-info/LICENSE,sha256=B-3FuHfe3S0fWAlKlceskPcRhzXq81g-rJ-ddUYb4O8,1062
18
- fraudcrawler-0.4.3.dist-info/METADATA,sha256=jlk2WdtXEK0-s6QRQdI96EBpQiyHWKgJiYeW93yiU24,5931
19
- fraudcrawler-0.4.3.dist-info/WHEEL,sha256=RaoafKOydTQ7I_I3JTrPCg6kUmTgtm4BornzOqyEfJ8,88
20
- fraudcrawler-0.4.3.dist-info/entry_points.txt,sha256=1Befm7cM6945y2AA1z9V4gZV63mtSWcAs7ypvgux_Xg,79
21
- fraudcrawler-0.4.3.dist-info/RECORD,,
16
+ fraudcrawler/settings.py,sha256=XL4hnTO-Ks4BY_SRl8zdSUg5aUN3ZXyDg5kHx_xflJo,3365
17
+ fraudcrawler-0.4.5.dist-info/LICENSE,sha256=B-3FuHfe3S0fWAlKlceskPcRhzXq81g-rJ-ddUYb4O8,1062
18
+ fraudcrawler-0.4.5.dist-info/METADATA,sha256=N9Kjjf5YwqzqP6jyq-onZkE_rEuTcmdiUawj6baHEHk,5880
19
+ fraudcrawler-0.4.5.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
20
+ fraudcrawler-0.4.5.dist-info/entry_points.txt,sha256=1Befm7cM6945y2AA1z9V4gZV63mtSWcAs7ypvgux_Xg,79
21
+ fraudcrawler-0.4.5.dist-info/RECORD,,
@@ -1,4 +1,4 @@
1
1
  Wheel-Version: 1.0
2
- Generator: poetry-core 2.0.0
2
+ Generator: poetry-core 1.9.0
3
3
  Root-Is-Purelib: true
4
4
  Tag: py3-none-any