fraudcrawler 0.3.3__tar.gz → 0.3.5__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of fraudcrawler might be problematic.
- {fraudcrawler-0.3.3 → fraudcrawler-0.3.5}/PKG-INFO +2 -2
- {fraudcrawler-0.3.3 → fraudcrawler-0.3.5}/README.md +1 -1
- {fraudcrawler-0.3.3 → fraudcrawler-0.3.5}/fraudcrawler/base/base.py +12 -3
- {fraudcrawler-0.3.3 → fraudcrawler-0.3.5}/fraudcrawler/base/orchestrator.py +12 -4
- {fraudcrawler-0.3.3 → fraudcrawler-0.3.5}/fraudcrawler/processing/processor.py +16 -6
- {fraudcrawler-0.3.3 → fraudcrawler-0.3.5}/fraudcrawler/settings.py +0 -3
- {fraudcrawler-0.3.3 → fraudcrawler-0.3.5}/pyproject.toml +1 -1
- {fraudcrawler-0.3.3 → fraudcrawler-0.3.5}/LICENSE +0 -0
- {fraudcrawler-0.3.3 → fraudcrawler-0.3.5}/fraudcrawler/__init__.py +0 -0
- {fraudcrawler-0.3.3 → fraudcrawler-0.3.5}/fraudcrawler/base/__init__.py +0 -0
- {fraudcrawler-0.3.3 → fraudcrawler-0.3.5}/fraudcrawler/base/client.py +0 -0
- {fraudcrawler-0.3.3 → fraudcrawler-0.3.5}/fraudcrawler/base/google-languages.json +0 -0
- {fraudcrawler-0.3.3 → fraudcrawler-0.3.5}/fraudcrawler/base/google-locations.json +0 -0
- {fraudcrawler-0.3.3 → fraudcrawler-0.3.5}/fraudcrawler/launch_demo_pipeline.py +0 -0
- {fraudcrawler-0.3.3 → fraudcrawler-0.3.5}/fraudcrawler/processing/__init__.py +0 -0
- {fraudcrawler-0.3.3 → fraudcrawler-0.3.5}/fraudcrawler/scraping/__init__.py +0 -0
- {fraudcrawler-0.3.3 → fraudcrawler-0.3.5}/fraudcrawler/scraping/enrich.py +0 -0
- {fraudcrawler-0.3.3 → fraudcrawler-0.3.5}/fraudcrawler/scraping/serp.py +0 -0
- {fraudcrawler-0.3.3 → fraudcrawler-0.3.5}/fraudcrawler/scraping/zyte.py +0 -0
{fraudcrawler-0.3.3 → fraudcrawler-0.3.5}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.3
 Name: fraudcrawler
-Version: 0.3.3
+Version: 0.3.5
 Summary: Intelligent Market Monitoring
 Home-page: https://github/open-veanu/fraudcrawler
 License: MIT

@@ -68,7 +68,7 @@ The location used in SerpAPI ('gl' parameter). `location=Location('Switzerland')
 Defines the search depth with the number of results to retrieve and optional enrichment parameters.
 
 #### `prompts: List[Prompt]`
-The list of prompts to classify a given product with (multiple) LLM calls. Each prompt object has a `name`, a `context` (used for defining the user prompt), a `system_prompt` (for defining the classification task), `allowed_classes` (a list of possible classes)
+The list of prompts to classify a given product with (multiple) LLM calls. Each prompt object has a `name`, a `context` (used for defining the user prompt), a `system_prompt` (for defining the classification task), and `allowed_classes` (a list of possible classes).
 
 ```python
 from fraudcrawler import Language, Location, Deepness, Prompt
{fraudcrawler-0.3.3 → fraudcrawler-0.3.5}/README.md

@@ -46,7 +46,7 @@ The location used in SerpAPI ('gl' parameter). `location=Location('Switzerland')
 Defines the search depth with the number of results to retrieve and optional enrichment parameters.
 
 #### `prompts: List[Prompt]`
-The list of prompts to classify a given product with (multiple) LLM calls. Each prompt object has a `name`, a `context` (used for defining the user prompt), a `system_prompt` (for defining the classification task), `allowed_classes` (a list of possible classes)
+The list of prompts to classify a given product with (multiple) LLM calls. Each prompt object has a `name`, a `context` (used for defining the user prompt), a `system_prompt` (for defining the classification task), and `allowed_classes` (a list of possible classes).
 
 ```python
 from fraudcrawler import Language, Location, Deepness, Prompt
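As a hedged illustration of the `Prompt` object described in this README change, a prompt might be constructed as follows; the string values and the variable name are invented for the example, while the field names (`name`, `context`, `system_prompt`, `allowed_classes`) and the import path come from the README and `base.py`:

```python
from fraudcrawler import Prompt

# Example values only; the field names follow the README description above.
relevance_prompt = Prompt(
    name="relevance",
    context="Products that may be illegally marketed as medical devices.",
    system_prompt=(
        "Classify the product as 1 if it matches the context, otherwise 0. "
        "Answer with a single integer."
    ),
    allowed_classes=[0, 1],
)
```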
{fraudcrawler-0.3.3 → fraudcrawler-0.3.5}/fraudcrawler/base/base.py

@@ -1,6 +1,10 @@
 import json
 import logging
-from pydantic import
+from pydantic import (
+    BaseModel,
+    field_validator,
+    model_validator,
+)
 from pydantic_settings import BaseSettings
 from typing import List
 

@@ -9,7 +13,6 @@ import aiohttp
 from fraudcrawler.settings import (
     GOOGLE_LANGUAGES_FILENAME,
     GOOGLE_LOCATIONS_FILENAME,
-    PROCESSOR_DEFAULT_IF_MISSING,
 )
 
 logger = logging.getLogger(__name__)

@@ -111,7 +114,13 @@ class Prompt(BaseModel):
     context: str
     system_prompt: str
     allowed_classes: List[int]
-
+
+    @field_validator("allowed_classes", mode="before")
+    def check_for_positive_value(cls, val):
+        """Check if all values are positive."""
+        if not all(isinstance(i, int) and i >= 0 for i in val):
+            raise ValueError("all values in allowed_classes must be positive integers.")
+        return val
 
 
 class AsyncClient:
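With the new `field_validator`, a `Prompt` whose `allowed_classes` contains a negative value is now rejected at construction time. A minimal sketch of that behavior, with invented field values:

```python
from pydantic import ValidationError

from fraudcrawler import Prompt

try:
    Prompt(
        name="broken",
        context="Example context.",
        system_prompt="Classify the product.",
        allowed_classes=[-1, 1],  # the -1 trips check_for_positive_value
    )
except ValidationError as exc:
    print(exc)  # "all values in allowed_classes must be positive integers."
```

Note that the check uses `i >= 0`, so `0` is still accepted even though the error message says "positive".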
{fraudcrawler-0.3.3 → fraudcrawler-0.3.5}/fraudcrawler/base/orchestrator.py

@@ -4,13 +4,17 @@ import logging
 from pydantic import BaseModel, Field
 from typing import Dict, List, Set, cast
 
-from fraudcrawler.settings import
+from fraudcrawler.settings import (
+    PROCESSOR_DEFAULT_MODEL,
+    PROCESSOR_DEFAULT_IF_MISSING,
+    MAX_RETRIES,
+    RETRY_DELAY,
+)
 from fraudcrawler.settings import (
     DEFAULT_N_SERP_WKRS,
     DEFAULT_N_ZYTE_WKRS,
     DEFAULT_N_PROC_WKRS,
 )
-from fraudcrawler.settings import PRODUCT_ITEM_DEFAULT_IS_RELEVANT
 from fraudcrawler.base.base import Deepness, Host, Language, Location, Prompt
 from fraudcrawler import SerpApi, Enricher, ZyteApi, Processor
 

@@ -40,7 +44,6 @@ class ProductItem(BaseModel):
     # Filtering parameters
     filtered: bool = False
     filtered_at_stage: str | None = None
-    is_relevant: int = PRODUCT_ITEM_DEFAULT_IS_RELEVANT
 
 
 class Orchestrator(ABC):

@@ -69,6 +72,7 @@ class Orchestrator(ABC):
         openai_model: str = PROCESSOR_DEFAULT_MODEL,
         max_retries: int = MAX_RETRIES,
         retry_delay: int = RETRY_DELAY,
+        default_if_missing: int = PROCESSOR_DEFAULT_IF_MISSING,
         n_serp_wkrs: int = DEFAULT_N_SERP_WKRS,
         n_zyte_wkrs: int = DEFAULT_N_ZYTE_WKRS,
         n_proc_wkrs: int = DEFAULT_N_PROC_WKRS,

@@ -100,7 +104,11 @@ class Orchestrator(ABC):
         self._zyteapi = ZyteApi(
             api_key=zyteapi_key, max_retries=max_retries, retry_delay=retry_delay
         )
-        self._processor = Processor(
+        self._processor = Processor(
+            api_key=openaiapi_key,
+            model=openai_model,
+            default_if_missing=default_if_missing,
+        )
 
         # Setup the async framework
         self._n_serp_wkrs = n_serp_wkrs
{fraudcrawler-0.3.3 → fraudcrawler-0.3.5}/fraudcrawler/processing/processor.py

@@ -3,7 +3,10 @@ import logging
 from openai import AsyncOpenAI
 
 from fraudcrawler.base.base import Prompt
-from fraudcrawler.settings import
+from fraudcrawler.settings import (
+    PROCESSOR_USER_PROMPT_TEMPLATE,
+    PROCESSOR_DEFAULT_IF_MISSING,
+)
 
 
 logger = logging.getLogger(__name__)

@@ -12,15 +15,22 @@ logger = logging.getLogger(__name__)
 class Processor:
     """Processes product data for classification based on a prompt configuration."""
 
-    def __init__(
+    def __init__(
+        self,
+        api_key: str,
+        model: str,
+        default_if_missing: int = PROCESSOR_DEFAULT_IF_MISSING,
+    ):
         """Initializes the Processor.
 
         Args:
             api_key: The OpenAI API key.
             model: The OpenAI model to use.
+            default_if_missing: The default classification to return if error occurs.
         """
         self._client = AsyncOpenAI(api_key=api_key)
         self._model = model
+        self._default_if_missing = default_if_missing
 
     async def _call_openai_api(
         self,

@@ -54,7 +64,7 @@ class Processor:
             description: Product description (often used in the user_prompt).
 
         Note:
-            This method returns `
+            This method returns `PROCESSOR_DEFAULT_IF_MISSING` if:
             - 'name' or 'description' is None
             - an error occurs during the API call
             - if the response isn't in allowed_classes.

@@ -64,7 +74,7 @@ class Processor:
             logger.warning(
                 f"Missing required fields for classification: name='{name}', description='{description}'"
             )
-            return
+            return self._default_if_missing
 
         # Substitute placeholders in user_prompt with the relevant arguments
         user_prompt = PROCESSOR_USER_PROMPT_TEMPLATE.format(

@@ -91,7 +101,7 @@ class Processor:
             logger.warning(
                 f"Classification '{classification}' not in allowed classes {prompt.allowed_classes}"
             )
-            return
+            return self._default_if_missing
 
             logger.info(
                 f'Classification for "{name}" (prompt={prompt.name}): {classification}'

@@ -102,4 +112,4 @@ class Processor:
             logger.error(
                 f'Error classifying product "{name}" with prompt "{prompt.name}": {e}'
             )
-            return
+            return self._default_if_missing
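Taken together, these processor changes route every fallback through a configurable `default_if_missing`, which the `Orchestrator` constructor now forwards to the `Processor`. A hedged usage sketch; the API key and model name below are placeholders, while the keyword arguments follow the 0.3.5 `__init__` signature shown above:

```python
from fraudcrawler import Processor

processor = Processor(
    api_key="sk-...",      # placeholder OpenAI API key
    model="gpt-4o-mini",   # placeholder model name
    # Returned when name/description is missing, the API call raises,
    # or the response is not in the prompt's allowed_classes.
    default_if_missing=0,
)
```

If `default_if_missing` is omitted, it falls back to `PROCESSOR_DEFAULT_IF_MISSING` from `fraudcrawler.settings`.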
{fraudcrawler-0.3.3 → fraudcrawler-0.3.5}/fraudcrawler/settings.py

@@ -22,9 +22,6 @@ PROCESSOR_USER_PROMPT_TEMPLATE = (
     "Context: {context}\n\nProduct Details: {name}\n{description}\\n\nRelevance:"
 )
 
-# Orchestrator settings
-PRODUCT_ITEM_DEFAULT_IS_RELEVANT = -1
-
 # Async settings
 DEFAULT_N_SERP_WKRS = 10
 DEFAULT_N_ZYTE_WKRS = 10
All other files listed above with +0 -0 are unchanged between 0.3.3 and 0.3.5.