fraudcrawler-0.3.3-py3-none-any.whl → fraudcrawler-0.3.5-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of fraudcrawler might be problematic. Click here for more details.

fraudcrawler/base/base.py CHANGED
@@ -1,6 +1,10 @@
1
1
  import json
2
2
  import logging
3
- from pydantic import BaseModel, field_validator, model_validator
3
+ from pydantic import (
4
+ BaseModel,
5
+ field_validator,
6
+ model_validator,
7
+ )
4
8
  from pydantic_settings import BaseSettings
5
9
  from typing import List
6
10
 
@@ -9,7 +13,6 @@ import aiohttp
9
13
  from fraudcrawler.settings import (
10
14
  GOOGLE_LANGUAGES_FILENAME,
11
15
  GOOGLE_LOCATIONS_FILENAME,
12
- PROCESSOR_DEFAULT_IF_MISSING,
13
16
  )
14
17
 
15
18
  logger = logging.getLogger(__name__)
@@ -111,7 +114,13 @@ class Prompt(BaseModel):
111
114
  context: str
112
115
  system_prompt: str
113
116
  allowed_classes: List[int]
114
- default_if_missing: int = PROCESSOR_DEFAULT_IF_MISSING
117
+
118
+ @field_validator("allowed_classes", mode="before")
119
+ def check_for_positive_value(cls, val):
120
+ """Check if all values are positive."""
121
+ if not all(isinstance(i, int) and i >= 0 for i in val):
122
+ raise ValueError("all values in allowed_classes must be positive integers.")
123
+ return val
115
124
 
116
125
 
117
126
  class AsyncClient:
fraudcrawler/base/orchestrator.py CHANGED
@@ -4,13 +4,17 @@ import logging
4
4
  from pydantic import BaseModel, Field
5
5
  from typing import Dict, List, Set, cast
6
6
 
7
- from fraudcrawler.settings import PROCESSOR_DEFAULT_MODEL, MAX_RETRIES, RETRY_DELAY
7
+ from fraudcrawler.settings import (
8
+ PROCESSOR_DEFAULT_MODEL,
9
+ PROCESSOR_DEFAULT_IF_MISSING,
10
+ MAX_RETRIES,
11
+ RETRY_DELAY,
12
+ )
8
13
  from fraudcrawler.settings import (
9
14
  DEFAULT_N_SERP_WKRS,
10
15
  DEFAULT_N_ZYTE_WKRS,
11
16
  DEFAULT_N_PROC_WKRS,
12
17
  )
13
- from fraudcrawler.settings import PRODUCT_ITEM_DEFAULT_IS_RELEVANT
14
18
  from fraudcrawler.base.base import Deepness, Host, Language, Location, Prompt
15
19
  from fraudcrawler import SerpApi, Enricher, ZyteApi, Processor
16
20
 
@@ -40,7 +44,6 @@ class ProductItem(BaseModel):
40
44
  # Filtering parameters
41
45
  filtered: bool = False
42
46
  filtered_at_stage: str | None = None
43
- is_relevant: int = PRODUCT_ITEM_DEFAULT_IS_RELEVANT
44
47
 
45
48
 
46
49
  class Orchestrator(ABC):
@@ -69,6 +72,7 @@ class Orchestrator(ABC):
69
72
  openai_model: str = PROCESSOR_DEFAULT_MODEL,
70
73
  max_retries: int = MAX_RETRIES,
71
74
  retry_delay: int = RETRY_DELAY,
75
+ default_if_missing: int = PROCESSOR_DEFAULT_IF_MISSING,
72
76
  n_serp_wkrs: int = DEFAULT_N_SERP_WKRS,
73
77
  n_zyte_wkrs: int = DEFAULT_N_ZYTE_WKRS,
74
78
  n_proc_wkrs: int = DEFAULT_N_PROC_WKRS,
@@ -100,7 +104,11 @@ class Orchestrator(ABC):
100
104
  self._zyteapi = ZyteApi(
101
105
  api_key=zyteapi_key, max_retries=max_retries, retry_delay=retry_delay
102
106
  )
103
- self._processor = Processor(api_key=openaiapi_key, model=openai_model)
107
+ self._processor = Processor(
108
+ api_key=openaiapi_key,
109
+ model=openai_model,
110
+ default_if_missing=default_if_missing,
111
+ )
104
112
 
105
113
  # Setup the async framework
106
114
  self._n_serp_wkrs = n_serp_wkrs
fraudcrawler/processing/processor.py CHANGED
@@ -3,7 +3,10 @@ import logging
3
3
  from openai import AsyncOpenAI
4
4
 
5
5
  from fraudcrawler.base.base import Prompt
6
- from fraudcrawler.settings import PROCESSOR_USER_PROMPT_TEMPLATE
6
+ from fraudcrawler.settings import (
7
+ PROCESSOR_USER_PROMPT_TEMPLATE,
8
+ PROCESSOR_DEFAULT_IF_MISSING,
9
+ )
7
10
 
8
11
 
9
12
  logger = logging.getLogger(__name__)
@@ -12,15 +15,22 @@ logger = logging.getLogger(__name__)
12
15
  class Processor:
13
16
  """Processes product data for classification based on a prompt configuration."""
14
17
 
15
- def __init__(self, api_key: str, model: str):
18
+ def __init__(
19
+ self,
20
+ api_key: str,
21
+ model: str,
22
+ default_if_missing: int = PROCESSOR_DEFAULT_IF_MISSING,
23
+ ):
16
24
  """Initializes the Processor.
17
25
 
18
26
  Args:
19
27
  api_key: The OpenAI API key.
20
28
  model: The OpenAI model to use.
29
+ default_if_missing: The default classification to return if error occurs.
21
30
  """
22
31
  self._client = AsyncOpenAI(api_key=api_key)
23
32
  self._model = model
33
+ self._default_if_missing = default_if_missing
24
34
 
25
35
  async def _call_openai_api(
26
36
  self,
@@ -54,7 +64,7 @@ class Processor:
54
64
  description: Product description (often used in the user_prompt).
55
65
 
56
66
  Note:
57
- This method returns `prompt.default_if_missing` if:
67
+ This method returns `PROCESSOR_DEFAULT_IF_MISSING` if:
58
68
  - 'name' or 'description' is None
59
69
  - an error occurs during the API call
60
70
  - if the response isn't in allowed_classes.
@@ -64,7 +74,7 @@ class Processor:
64
74
  logger.warning(
65
75
  f"Missing required fields for classification: name='{name}', description='{description}'"
66
76
  )
67
- return prompt.default_if_missing
77
+ return self._default_if_missing
68
78
 
69
79
  # Substitute placeholders in user_prompt with the relevant arguments
70
80
  user_prompt = PROCESSOR_USER_PROMPT_TEMPLATE.format(
@@ -91,7 +101,7 @@ class Processor:
91
101
  logger.warning(
92
102
  f"Classification '{classification}' not in allowed classes {prompt.allowed_classes}"
93
103
  )
94
- return prompt.default_if_missing
104
+ return self._default_if_missing
95
105
 
96
106
  logger.info(
97
107
  f'Classification for "{name}" (prompt={prompt.name}): {classification}'
@@ -102,4 +112,4 @@ class Processor:
102
112
  logger.error(
103
113
  f'Error classifying product "{name}" with prompt "{prompt.name}": {e}'
104
114
  )
105
- return prompt.default_if_missing
115
+ return self._default_if_missing
fraudcrawler/settings.py CHANGED
@@ -22,9 +22,6 @@ PROCESSOR_USER_PROMPT_TEMPLATE = (
22
22
  "Context: {context}\n\nProduct Details: {name}\n{description}\\n\nRelevance:"
23
23
  )
24
24
 
25
- # Orchestrator settings
26
- PRODUCT_ITEM_DEFAULT_IS_RELEVANT = -1
27
-
28
25
  # Async settings
29
26
  DEFAULT_N_SERP_WKRS = 10
30
27
  DEFAULT_N_ZYTE_WKRS = 10
fraudcrawler-0.3.3.dist-info/METADATA → fraudcrawler-0.3.5.dist-info/METADATA CHANGED
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: fraudcrawler
3
- Version: 0.3.3
3
+ Version: 0.3.5
4
4
  Summary: Intelligent Market Monitoring
5
5
  Home-page: https://github/open-veanu/fraudcrawler
6
6
  License: MIT
@@ -68,7 +68,7 @@ The location used in SerpAPI ('gl' parameter). `location=Location('Switzerland')
68
68
  Defines the search depth with the number of results to retrieve and optional enrichment parameters.
69
69
 
70
70
  #### `prompts: List[Prompt]`
71
- The list of prompts to classify a given product with (multiple) LLM calls. Each prompt object has a `name`, a `context` (used for defining the user prompt), a `system_prompt` (for defining the classification task), `allowed_classes` (a list of possible classes) and optionally `default_if_missing` (a default class if anything goes wrong).
71
+ The list of prompts to classify a given product with (multiple) LLM calls. Each prompt object has a `name`, a `context` (used for defining the user prompt), a `system_prompt` (for defining the classification task), and `allowed_classes` (a list of possible classes).
72
72
 
73
73
  ```python
74
74
  from fraudcrawler import Language, Location, Deepness, Prompt
fraudcrawler-0.3.3.dist-info/RECORD → fraudcrawler-0.3.5.dist-info/RECORD CHANGED
@@ -1,20 +1,20 @@
1
1
  fraudcrawler/__init__.py,sha256=2EgoTb2jNcQt1NxUV8za0154kb7ZnHZ_KeKgx21rdFs,679
2
2
  fraudcrawler/base/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
3
- fraudcrawler/base/base.py,sha256=YgX7cUB3Fta-sXWZu5I-gn85sCfpmoa8M67Whn1m56o,4241
3
+ fraudcrawler/base/base.py,sha256=KnwOcy35EKyelcgVh95LmOZziWFS6dKlegLK6A96wvg,4485
4
4
  fraudcrawler/base/client.py,sha256=GcTUMqLfvweLFdHy6CP9tgxsFQiPkc6KyiLcwLnDiw8,4412
5
5
  fraudcrawler/base/google-languages.json,sha256=z0VtjMCsCcZq11OkCIb9jMDD1p9Ty4lhV7bq4ddYvec,10748
6
6
  fraudcrawler/base/google-locations.json,sha256=UtNu0iSStllvFRTQXMobWKmZR1hKmtgtHftLNgaJTT0,9204345
7
- fraudcrawler/base/orchestrator.py,sha256=GmJvrP9jKr4FqTKhuU9YMEuZ54gV0asHrSCxhM43onA,23903
7
+ fraudcrawler/base/orchestrator.py,sha256=Gmryv8l8nB1QUwwjLoZGop2mwKqWYQQORT_96_w5ptA,23981
8
8
  fraudcrawler/launch_demo_pipeline.py,sha256=RIZTtdtZeJPhvSLp1IUjT_nhme_2q6mAGWKoL838E4E,4320
9
9
  fraudcrawler/processing/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
10
- fraudcrawler/processing/processor.py,sha256=sNYK0gY7PsJJS5FMYOiHLXEQJ70buifSokuIiXk5dG4,3715
10
+ fraudcrawler/processing/processor.py,sha256=IFVKIiNi0QoCAgPFkFtNDgxfhh01iDNUyIBZWACplR8,3993
11
11
  fraudcrawler/scraping/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
12
12
  fraudcrawler/scraping/enrich.py,sha256=X1BBZshdZqPmbduzhGwH0ULSzq03L_7bf7_UL8yOQ9E,10608
13
13
  fraudcrawler/scraping/serp.py,sha256=wT8vhk0EugcrS2CCvMuCCZrlw1MRI-ahtGYKdNUZQo8,8830
14
14
  fraudcrawler/scraping/zyte.py,sha256=ggI4iYG-E_UyiKgUpEFekeUd1giifEfJ_uyFUSJGSLY,6296
15
- fraudcrawler/settings.py,sha256=eUb7zd2Q7fYUrLk4cl_d48lZ9zaB8iU7M0zFFuZc_-g,786
16
- fraudcrawler-0.3.3.dist-info/LICENSE,sha256=B-3FuHfe3S0fWAlKlceskPcRhzXq81g-rJ-ddUYb4O8,1062
17
- fraudcrawler-0.3.3.dist-info/METADATA,sha256=860K3oeNRoAC2Fmr9D4Gd1DYaXw1KOaWSEBeByL4V-U,6030
18
- fraudcrawler-0.3.3.dist-info/WHEEL,sha256=RaoafKOydTQ7I_I3JTrPCg6kUmTgtm4BornzOqyEfJ8,88
19
- fraudcrawler-0.3.3.dist-info/entry_points.txt,sha256=1Befm7cM6945y2AA1z9V4gZV63mtSWcAs7ypvgux_Xg,79
20
- fraudcrawler-0.3.3.dist-info/RECORD,,
15
+ fraudcrawler/settings.py,sha256=yAgGvZ9wAdaYbN5c0SBZoTUkjjLOyU2je1109qcbTzQ,723
16
+ fraudcrawler-0.3.5.dist-info/LICENSE,sha256=B-3FuHfe3S0fWAlKlceskPcRhzXq81g-rJ-ddUYb4O8,1062
17
+ fraudcrawler-0.3.5.dist-info/METADATA,sha256=z0wINs19mCGOFPrXlb4FRSivMQpWG5zTgFmCXu6pIE8,5957
18
+ fraudcrawler-0.3.5.dist-info/WHEEL,sha256=RaoafKOydTQ7I_I3JTrPCg6kUmTgtm4BornzOqyEfJ8,88
19
+ fraudcrawler-0.3.5.dist-info/entry_points.txt,sha256=1Befm7cM6945y2AA1z9V4gZV63mtSWcAs7ypvgux_Xg,79
20
+ fraudcrawler-0.3.5.dist-info/RECORD,,