fraudcrawler 0.7.21__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

This version of fraudcrawler might be problematic.

@@ -0,0 +1,54 @@
+ from tenacity import (
+     AsyncRetrying,
+     retry_if_exception,
+     stop_after_attempt,
+     wait_exponential_jitter,
+ )
+
+ from fraudcrawler.settings import (
+     RETRY_STOP_AFTER_ATTEMPT,
+     RETRY_INITIAL_DELAY,
+     RETRY_MAX_DELAY,
+     RETRY_EXP_BASE,
+     RETRY_JITTER,
+     RETRY_SKIP_IF_CODE,
+ )
+
+
+ def _is_retryable_exception(err: BaseException) -> bool:
+     """Checks whether a failed HTTP request is worth retrying."""
+
+     # Get the status_code from err, either via its response or directly
+     response = getattr(err, "response", None)
+     if response is not None:
+         status_code = getattr(response, "status_code", None)
+     else:
+         status_code = getattr(err, "status_code", None)
+
+     # Skip the retry for status codes that are known not to be transient
+     if status_code is not None and status_code in RETRY_SKIP_IF_CODE:
+         return False
+
+     # Otherwise, retry
+     return True
+
+
+ def get_async_retry(
+     stop_after: int = RETRY_STOP_AFTER_ATTEMPT,
+     initial_delay: int = RETRY_INITIAL_DELAY,
+     max_delay: int = RETRY_MAX_DELAY,
+     exp_base: int = RETRY_EXP_BASE,
+     jitter: int = RETRY_JITTER,
+ ) -> AsyncRetrying:
+     """Returns the retry configuration for async operations."""
+     return AsyncRetrying(
+         retry=retry_if_exception(_is_retryable_exception),
+         stop=stop_after_attempt(stop_after),
+         wait=wait_exponential_jitter(
+             initial=initial_delay,
+             max=max_delay,
+             exp_base=exp_base,
+             jitter=jitter,
+         ),
+         reraise=True,
+     )
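
For context, a minimal usage sketch of the retry policy above, assuming an httpx-based caller; the fetch_with_retry helper and the httpx import are illustrative, not part of the package. tenacity's AsyncRetrying is driven with `async for attempt in ...` together with `with attempt:`:

import httpx

async def fetch_with_retry(url: str) -> httpx.Response:
    # Sketch only: drive the AsyncRetrying policy returned by get_async_retry()
    async for attempt in get_async_retry():
        with attempt:
            async with httpx.AsyncClient() as client:
                response = await client.get(url)
                # raise_for_status() raises httpx.HTTPStatusError, whose
                # `.response` attribute _is_retryable_exception inspects
                response.raise_for_status()
                return response
    raise AssertionError("unreachable: reraise=True re-raises the final error")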
@@ -0,0 +1,162 @@
+ import asyncio
+ import logging
+ from typing import Sequence
+
+ from fraudcrawler.base.base import Setup
+ from fraudcrawler import (
+     FraudCrawlerClient,
+     HttpxAsyncClient,
+     Searcher,
+     Enricher,
+     URLCollector,
+     ZyteAPI,
+     SearchEngineName,
+     Language,
+     Location,
+     Deepness,
+     Processor,
+     Workflow,
+     OpenAIClassification,
+ )
+
+ LOG_FMT = "%(asctime)s | %(name)s | %(funcName)s | %(levelname)s | %(message)s"
+ LOG_LVL = "INFO"
+ DATE_FMT = "%Y-%m-%d %H:%M:%S"
+ SETUP = Setup()  # type: ignore[call-arg]
+ logging.basicConfig(format=LOG_FMT, level=LOG_LVL, datefmt=DATE_FMT)
+
+
+ def _setup_workflows(http_client: HttpxAsyncClient) -> Sequence[Workflow]:
+     """Sets up the workflows to be run by the Processor."""
+     _AVAILABILITY_SYSTEM_PROMPT = (
+         "You are a helpful and intelligent assistant helping an organization that is interested in checking the availability of certain products. "
+         "Your task is to classify any given product as either available (1) or not available (0), strictly based on the context and product details provided by the user. "
+         "You must consider all aspects of the given context and make a binary decision accordingly. "
+         "If the product can be purchased, added to a shopping basket, delivered, or is listed as available in any form, classify it as 1 (available); "
+         "if there is any mention of out of stock, not available, no longer shippable, or similar, classify it as 0 (not available). "
+         "Respond only with the number 1 or 0."
+     )
+     _SERIOUSNESS_SYSTEM_PROMPT = (
+         "You are a helpful and intelligent assistant helping an organization that is interested in checking the energy efficiency of certain devices. "
+         "Your task is to classify each item as either a product for sale (1) or not a product for sale (0). To make this distinction, consider the following criteria: \n"
+         " 1 Product for Sale (1): Classify as 1 if the result clearly indicates an item available for purchase, typically found "
+         "within an online shop or marketplace.\n"
+         " 2 Not a Product for Sale (0): Classify as 0 if the result is unrelated to a direct purchase of a product. This includes items such as: \n"
+         " - Books and Videos: These may be available for sale, but if they are about or related to the searched product rather than being the "
+         "exact product itself, classify as 0.\n"
+         " - Advertisements: Promotional content that doesn't directly sell a product.\n"
+         " - Companies and Services: Names and descriptions of companies or services related to the product but not the product itself.\n"
+         " - Related Topics/Content: Any text or media that discusses or elaborates on the topic without offering a tangible product for sale.\n"
+         "Make your decision based solely on the context and details provided in the search result. Respond only with the number 1 or 0."
+     )
+     return [
+         OpenAIClassification(
+             http_client=http_client,
+             name="availability",
+             api_key=SETUP.openaiapi_key,
+             model="gpt-4o",
+             product_item_fields=["product_name", "html_clean"],
+             system_prompt=_AVAILABILITY_SYSTEM_PROMPT,
+             allowed_classes=[0, 1],
+         ),
+         OpenAIClassification(
+             http_client=http_client,
+             name="seriousness",
+             api_key=SETUP.openaiapi_key,
+             model="gpt-4o",
+             product_item_fields=["product_name", "product_description"],
+             system_prompt=_SERIOUSNESS_SYSTEM_PROMPT,
+             allowed_classes=[0, 1],
+         ),
+     ]
+
+
+ async def run(http_client: HttpxAsyncClient, search_term: str):
+     # Set up the search
+     search_engines = list(SearchEngineName)
+     language = Language(name="German")
+     location = Location(name="Switzerland")
+     deepness = Deepness(num_results=10)
+
+     # Optional: add term enrichment
+     # from fraudcrawler import Enrichment
+
+     # deepness.enrichment = Enrichment(additional_terms=10, additional_urls_per_term=20)
+
+     # Optional: add MARKETPLACES and EXCLUDED_URLS
+     from fraudcrawler import Host
+
+     # marketplaces = [
+     #     Host(name="International", domains="zavamed.com,apomeds.com"),
+     #     # Host(name="National", domains="netdoktor.ch,nobelpharma.ch"),
+     # ]
+     excluded_urls = [
+         Host(name="Digitec", domains="digitec.ch"),
+         Host(name="Brack", domains="brack.ch"),
+     ]
+
+     # Set up the clients
+     searcher = Searcher(
+         http_client=http_client,
+         serpapi_key=SETUP.serpapi_key,
+         zyteapi_key=SETUP.zyteapi_key,
+     )
+     enricher = Enricher(
+         http_client=http_client,
+         user=SETUP.dataforseo_user,
+         pwd=SETUP.dataforseo_pwd,
+     )
+     url_collector = URLCollector()
+     zyteapi = ZyteAPI(
+         http_client=http_client,
+         api_key=SETUP.zyteapi_key,
+     )
+     workflows = _setup_workflows(http_client=http_client)
+     processor = Processor(workflows=workflows)
+
+     # Set up the client
+     client = FraudCrawlerClient(
+         searcher=searcher,
+         enricher=enricher,
+         url_collector=url_collector,
+         zyteapi=zyteapi,
+         processor=processor,
+     )
+
+     # Execute the pipeline
+     await client.run(
+         search_term=search_term,
+         search_engines=search_engines,
+         language=language,
+         location=location,
+         deepness=deepness,
+         # marketplaces=marketplaces,
+         excluded_urls=excluded_urls,
+     )
+
+     # Show the results
+     print()
+     title = "Available results"
+     print(title)
+     print("=" * len(title))
+     client.print_available_results()
+     print()
+     title = f'Results for "{search_term.upper()}"'
+     print(title)
+     print("=" * len(title))
+     df = client.load_results()
+     print(f"Number of products found: {len(df)}")
+     print()
+     n_head = 10
+     print(f"First {n_head} products are:")
+     print(df.head(n=n_head))
+     print()
+
+
+ async def main(search_term: str):
+     async with HttpxAsyncClient() as http_client:
+         await run(http_client=http_client, search_term=search_term)
+
+
+ if __name__ == "__main__":
+     asyncio.run(main(search_term="Kaffeebohnen"))
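
The script above runs a single hard-coded search term. As a small variation (not part of the package), several terms could reuse one shared HTTP client; the term list here is illustrative:

async def main_many(search_terms: list[str]):
    # Sketch only: reuse one HttpxAsyncClient across sequential runs
    async with HttpxAsyncClient() as http_client:
        for term in search_terms:
            await run(http_client=http_client, search_term=term)

# asyncio.run(main_many(search_terms=["Kaffeebohnen", "Espressobohnen"]))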
File without changes
@@ -0,0 +1,129 @@
+ from abc import ABC, abstractmethod
+ import logging
+ from pydantic import BaseModel
+ from typing import Any, Dict, List, Sequence, TypeAlias
+
+ from fraudcrawler.base.base import ProductItem
+
+ logger = logging.getLogger(__name__)
+
+
+ UserInputs: TypeAlias = Dict[str, List[str]]
+
+
+ class ClassificationResult(BaseModel):
+     """Model for classification results."""
+
+     result: int
+     input_tokens: int = 0
+     output_tokens: int = 0
+
+
+ class TmpResult(BaseModel):
+     """Model for temporary (intermediate) results."""
+
+     result: Any
+     input_tokens: int = 0
+     output_tokens: int = 0
+
+
+ WorkflowResult: TypeAlias = ClassificationResult | TmpResult | None
+
+
+ class Workflow(ABC):
+     """Abstract base class for independent processing workflows."""
+
+     def __init__(
+         self,
+         name: str,
+     ):
+         """Initializes the classification workflow.
+
+         Args:
+             name: Name of the classification workflow.
+         """
+         self.name = name
+
+     @abstractmethod
+     async def run(self, product: ProductItem) -> WorkflowResult:
+         """Runs the workflow on the given product item."""
+         pass
+
+
+ class Processor:
+     """Processes product items through a set of classification workflows."""
+
+     def __init__(self, workflows: Sequence[Workflow]):
+         """Initializes the Processor.
+
+         Args:
+             workflows: Sequence of workflows for classification of product items.
+         """
+         if not self._are_unique(workflows=workflows):
+             raise ValueError(
+                 f"Workflow names are not unique: {[wf.name for wf in workflows]}"
+             )
+         self._workflows = workflows
+
+     @staticmethod
+     def _are_unique(workflows: Sequence[Workflow]) -> bool:
+         """Checks whether the workflows have unique names."""
+         return len(workflows) == len({wf.name for wf in workflows})
+
+     async def run(self, product: ProductItem) -> ProductItem:
+         """Runs all workflows on the product item and stores each result under its workflow's name.
+
+         Args:
+             product: The product item to process.
+         """
+         for wf in self._workflows:
+             try:
+                 logger.info(
+                     f'Running workflow="{wf.name}" for product with url="{product.url_resolved}".'
+                 )
+                 res = await wf.run(product=product)
+             except Exception:
+                 logger.error(
+                     f'Error while running workflow="{wf.name}" for product with url="{product.url_resolved}".',
+                     exc_info=True,
+                 )
+                 continue
+
+             # Update the product item
+             inp_tok = out_tok = 0
+             if isinstance(res, ClassificationResult):
+                 logger.debug(
+                     f'Result from workflow="{wf.name}" added to product.classifications.'
+                 )
+                 product.classifications[wf.name] = int(res.result)
+                 inp_tok = res.input_tokens
+                 out_tok = res.output_tokens
+
+             elif isinstance(res, TmpResult):
+                 logger.debug(f'Result from workflow="{wf.name}" added to product.tmp.')
+                 product.tmp[wf.name] = res
+                 inp_tok = res.input_tokens
+                 out_tok = res.output_tokens
+
+             elif res is None:
+                 logger.debug(
+                     f'Result from workflow="{wf.name}" is `None` and therefore not stored.'
+                 )
+
+             else:
+                 logger.warning(
+                     f'Result from workflow="{wf.name}" has type={type(res)}, '
+                     f"which is not allowed; it must be of type "
+                     f"`ClassificationResult`, `TmpResult`, or `None`."
+                 )
+
+             if inp_tok > 0 or out_tok > 0:
+                 logger.debug(
+                     f'Workflow="{wf.name}" used input_tokens={inp_tok}, output_tokens={out_tok}.'
+                 )
+                 product.usage[wf.name] = {
+                     "input_tokens": inp_tok,
+                     "output_tokens": out_tok,
+                 }
+
+         return product
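
A custom workflow only needs to subclass Workflow and return a ClassificationResult, a TmpResult, or None. A minimal sketch, assuming ProductItem exposes a product_description field (as the example script suggests); the KeywordFlag class and the keyword are illustrative, not part of the package:

class KeywordFlag(Workflow):
    """Sketch only: flags products whose description mentions a keyword."""

    def __init__(self, name: str, keyword: str):
        super().__init__(name=name)
        self._keyword = keyword.lower()

    async def run(self, product: ProductItem) -> WorkflowResult:
        # getattr guards against the field being absent on ProductItem
        text = (getattr(product, "product_description", "") or "").lower()
        return ClassificationResult(result=int(self._keyword in text))

# processor = Processor(workflows=[KeywordFlag(name="keyword_flag", keyword="sale")])
# product = await processor.run(product=product)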