fraudcrawler 0.5.0__py3-none-any.whl → 0.7.22__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,4 +1,3 @@
1
- from httpx import HTTPStatusError
2
1
  from tenacity import (
3
2
  AsyncRetrying,
4
3
  retry_if_exception,
@@ -17,24 +16,39 @@ from fraudcrawler.settings import (
17
16
 
18
17
 
19
18
  def _is_retryable_exception(err: BaseException) -> bool:
20
- if (
21
- isinstance(err, HTTPStatusError)
22
- and err.response.status_code in RETRY_SKIP_IF_CODE
23
- ):
19
+ """Checks if failing HTTP connection is worth to be re-tried."""
20
+
21
+ # Get status_code from err
22
+ response = getattr(err, "response", None)
23
+ if response is not None:
24
+ status_code = getattr(response, "status_code", None)
25
+ else:
26
+ status_code = getattr(err, "status_code", None)
27
+
28
+ # Check if we skip retry
29
+ if status_code is not None and status_code in RETRY_SKIP_IF_CODE:
24
30
  return False
31
+
32
+ # Else we do try it again
25
33
  return True
26
34
 
27
35
 
28
def get_async_retry(
    stop_after: int = RETRY_STOP_AFTER_ATTEMPT,
    initial_delay: float = RETRY_INITIAL_DELAY,
    max_delay: float = RETRY_MAX_DELAY,
    exp_base: float = RETRY_EXP_BASE,
    jitter: float = RETRY_JITTER,
) -> AsyncRetrying:
    """Return the retry configuration for async operations.

    The delay-related parameters are annotated as ``float`` because they are
    forwarded unchanged to ``wait_exponential_jitter``; integer values (such
    as the module defaults) remain valid.

    Args:
        stop_after: Passed to ``stop_after_attempt``.
        initial_delay: Passed to ``wait_exponential_jitter`` as ``initial``.
        max_delay: Passed to ``wait_exponential_jitter`` as ``max``.
        exp_base: Passed to ``wait_exponential_jitter`` as ``exp_base``.
        jitter: Passed to ``wait_exponential_jitter`` as ``jitter``.

    Returns:
        An ``AsyncRetrying`` instance that retries only exceptions accepted by
        ``_is_retryable_exception`` and is created with ``reraise=True``.
    """
    backoff = wait_exponential_jitter(
        initial=initial_delay,
        max=max_delay,
        exp_base=exp_base,
        jitter=jitter,
    )
    return AsyncRetrying(
        retry=retry_if_exception(_is_retryable_exception),
        stop=stop_after_attempt(stop_after),
        wait=backoff,
        reraise=True,
    )
@@ -1,55 +1,84 @@
1
+ import asyncio
1
2
  import logging
3
+ from typing import Sequence
2
4
 
3
- from fraudcrawler import FraudCrawlerClient, Language, Location, Deepness, Prompt
5
+ from fraudcrawler.base.base import Setup
6
+ from fraudcrawler import (
7
+ FraudCrawlerClient,
8
+ HttpxAsyncClient,
9
+ Searcher,
10
+ Enricher,
11
+ URLCollector,
12
+ ZyteAPI,
13
+ SearchEngineName,
14
+ Language,
15
+ Location,
16
+ Deepness,
17
+ Processor,
18
+ Workflow,
19
+ OpenAIClassification,
20
+ )
4
21
 
5
22
  LOG_FMT = "%(asctime)s | %(name)s | %(funcName)s | %(levelname)s | %(message)s"
6
23
  LOG_LVL = "INFO"
7
24
  DATE_FMT = "%Y-%m-%d %H:%M:%S"
25
+ SETUP = Setup() # type: ignore[call-arg]
8
26
  logging.basicConfig(format=LOG_FMT, level=LOG_LVL, datefmt=DATE_FMT)
9
27
 
10
28
 
11
- def search(search_term: str):
12
- # Setup the client
13
- client = FraudCrawlerClient()
14
-
15
- # Setup the search
16
- language = Language(name="German")
17
- location = Location(name="Switzerland")
18
- deepness = Deepness(num_results=10)
19
- prompts = [
20
- Prompt(
29
def _setup_workflows(http_client: HttpxAsyncClient) -> Sequence[Workflow]:
    """Set up the workflows to be run iteratively.

    Two ``OpenAIClassification`` workflows are created, "availability" and
    "seriousness". Both use the given HTTP client, the OpenAI key from
    ``SETUP``, the model "gpt-4o" and the allowed classes ``[0, 1]``.

    Args:
        http_client: HTTP client handed to every workflow.

    Returns:
        The list of configured workflows, in the order availability,
        seriousness.
    """
    # NOTE: the fragments are joined by implicit string concatenation, so each
    # one must end with its own separator (space or newline).
    availability_system_prompt = (
        "You are a helpful and intelligent assistant helping an organization that is interested in checking the availability of certain products. "
        "Your task is to classify any given product as either available (1) or not available (0), strictly based on the context and product details provided by the user. "
        "You must consider all aspects of the given context and make a binary decision accordingly. "
        "If the product can be purchased, added to a shopping basket, delivered, or is listed as available in any form, classify it as 1 (available); "
        "if there is any mention of out of stock, not available, no longer shippable, or similar, classify it as 0 (not available). "
        "Respond only with the number 1 or 0."
    )
    seriousness_system_prompt = (
        "You are a helpful and intelligent assistant helping an organization that is interested in checking the energy efficiency of certain devices. "
        "Your task is to classify each item as either a product for sale (1) or not a product for sale (0). To make this distinction, consider the following criteria: \n"
        " 1 Product for Sale (1): Classify as 1 if the result clearly indicates an item available for purchase, typically found "
        "within an online shop or marketplace.\n"
        " 2 Not a Product for Sale (0): Classify as 0 if the result is unrelated to a direct purchase of a product. This includes items such as: \n"
        " - Books and Videos: These may be available for sale, but if they are about or related to the searched product rather than being the "
        "exact product itself, classify as 0.\n"
        " - Advertisements: Promotional content that doesn't directly sell a product.\n"
        " - Companies and Services: Names and descriptions of companies or services related to the product but not the product itself.\n"
        " - Related Topics/Content: Any text or media that discusses or elaborates on the topic without offering a tangible product for sale.\n"
        "Make your decision based solely on the context and details provided in the search result. Respond only with the number 1 or 0."
    )

    availability = OpenAIClassification(
        http_client=http_client,
        name="availability",
        api_key=SETUP.openaiapi_key,
        model="gpt-4o",
        product_item_fields=["product_name", "html_clean"],
        system_prompt=availability_system_prompt,
        allowed_classes=[0, 1],
    )
    seriousness = OpenAIClassification(
        http_client=http_client,
        name="seriousness",
        api_key=SETUP.openaiapi_key,
        model="gpt-4o",
        product_item_fields=["product_name", "product_description"],
        system_prompt=seriousness_system_prompt,
        allowed_classes=[0, 1],
    )
    return [availability, seriousness]
52
- # # Optional: Add tern ENRICHEMENT
72
+
73
+
74
+ async def run(http_client: HttpxAsyncClient, search_term: str):
75
+ # Setup the search
76
+ search_engines = list(SearchEngineName)
77
+ language = Language(name="German")
78
+ location = Location(name="Switzerland")
79
+ deepness = Deepness(num_results=10)
80
+
81
+ # # Optional: Add term enrichment
53
82
  # from fraudcrawler import Enrichment
54
83
 
55
84
  # deepness.enrichment = Enrichment(additional_terms=10, additional_urls_per_term=20)
@@ -66,13 +95,41 @@ def search(search_term: str):
66
95
  Host(name="Brack", domains="brack.ch"),
67
96
  ]
68
97
 
98
+ # Setup clients
99
+ searcher = Searcher(
100
+ http_client=http_client,
101
+ serpapi_key=SETUP.serpapi_key,
102
+ zyteapi_key=SETUP.zyteapi_key,
103
+ )
104
+ enricher = Enricher(
105
+ http_client=http_client,
106
+ user=SETUP.dataforseo_user,
107
+ pwd=SETUP.dataforseo_pwd,
108
+ )
109
+ url_collector = URLCollector()
110
+ zyteapi = ZyteAPI(
111
+ http_client=http_client,
112
+ api_key=SETUP.zyteapi_key,
113
+ )
114
+ workflows = _setup_workflows(http_client=http_client)
115
+ processor = Processor(workflows=workflows)
116
+
117
+ # Setup the client
118
+ client = FraudCrawlerClient(
119
+ searcher=searcher,
120
+ enricher=enricher,
121
+ url_collector=url_collector,
122
+ zyteapi=zyteapi,
123
+ processor=processor,
124
+ )
125
+
69
126
  # Execute the pipeline
70
- client.execute(
127
+ await client.run(
71
128
  search_term=search_term,
129
+ search_engines=search_engines,
72
130
  language=language,
73
131
  location=location,
74
132
  deepness=deepness,
75
- prompts=prompts,
76
133
  # marketplaces=marketplaces,
77
134
  excluded_urls=excluded_urls,
78
135
  )
@@ -96,5 +153,10 @@ def search(search_term: str):
96
153
  print()
97
154
 
98
155
 
156
async def main(search_term: str):
    """Open one HTTP client and run the pipeline for ``search_term``.

    The client is used as an async context manager, so it is only alive for
    the duration of the ``run`` call.

    Args:
        search_term: Search term forwarded to ``run``.
    """
    async with HttpxAsyncClient() as shared_client:
        await run(http_client=shared_client, search_term=search_term)
159
+
160
+
99
161
  if __name__ == "__main__":
100
- search(search_term='Liebherr "TP1410"')
162
+ asyncio.run(main(search_term="Kaffeebohnen"))
@@ -0,0 +1,129 @@
1
+ from abc import ABC, abstractmethod
2
+ import logging
3
+ from pydantic import BaseModel
4
+ from typing import Any, Dict, List, Sequence, TypeAlias
5
+
6
+ from fraudcrawler.base.base import ProductItem
7
+
8
+ logger = logging.getLogger(__name__)
9
+
10
+
11
+ UserInputs: TypeAlias = Dict[str, List[str]]
12
+
13
+
14
class ClassificationResult(BaseModel):
    """Outcome of a classification workflow together with its token usage."""

    # Class assigned by the workflow.
    result: int
    # Token counters; both default to 0 when the workflow reports none.
    input_tokens: int = 0
    output_tokens: int = 0
20
+
21
+
22
class TmpResult(BaseModel):
    """Temporary (non-classification) workflow outcome with its token usage."""

    # Arbitrary payload produced by the workflow; no type is enforced.
    result: Any
    # Token counters; both default to 0 when the workflow reports none.
    input_tokens: int = 0
    output_tokens: int = 0
28
+
29
+
30
+ WorkflowResult: TypeAlias = ClassificationResult | TmpResult | None
31
+
32
+
33
class Workflow(ABC):
    """Base class that every independent processing workflow derives from."""

    def __init__(self, name: str):
        """Remember the name under which this workflow is identified.

        Args:
            name: Name of the classification workflow.
        """
        self.name = name

    @abstractmethod
    async def run(self, product: ProductItem) -> WorkflowResult:
        """Run the workflow on one product item; subclasses must override."""
        ...
51
+
52
+
53
class Processor:
    """Processes product items with a set of uniquely named workflows."""

    def __init__(self, workflows: Sequence[Workflow]):
        """Initializes the Processor.

        Args:
            workflows: Sequence of workflows for classification of product items.

        Raises:
            ValueError: If two or more workflows share the same name.
        """
        if not self._are_unique(workflows=workflows):
            raise ValueError(
                f"Workflow names are not unique: {[wf.name for wf in workflows]}"
            )
        # Freeze the validated sequence so that later mutation of the caller's
        # list cannot break the uniqueness guarantee checked above.
        self._workflows = tuple(workflows)

    @staticmethod
    def _are_unique(workflows: Sequence[Workflow]) -> bool:
        """Tests if the workflows have unique names."""
        return len(workflows) == len({wf.name for wf in workflows})

    @staticmethod
    def _store_result(product: ProductItem, name: str, res: WorkflowResult) -> None:
        """Writes one workflow result (and its token usage) onto the product.

        Args:
            product: The product item to update in place.
            name: Name of the workflow that produced `res`.
            res: The value returned by the workflow.
        """
        inp_tok = out_tok = 0
        if isinstance(res, ClassificationResult):
            logger.debug(
                'result from workflow="%s" added to product.classifications', name
            )
            product.classifications[name] = int(res.result)
            inp_tok = res.input_tokens
            out_tok = res.output_tokens

        elif isinstance(res, TmpResult):
            logger.debug('result from workflow="%s" added to product.tmp', name)
            # The whole TmpResult object is stored, not only its payload.
            product.tmp[name] = res
            inp_tok = res.input_tokens
            out_tok = res.output_tokens

        elif res is None:
            logger.debug(
                'result from workflow="%s" is `None` and therefore not stored', name
            )

        else:
            logger.warning(
                'result from workflow="%s" return type=%s is not allowed; '
                "must either be of type `ClassificationResult`, "
                "`TmpResult`, or `None`; not type=%s",
                name,
                type(res),
                type(res),
            )

        # Usage is only recorded when the workflow reported any tokens.
        if inp_tok > 0 or out_tok > 0:
            logger.debug(
                'result from workflow="%s" used input_tokens=%s, output_tokens=%s',
                name,
                inp_tok,
                out_tok,
            )
            product.usage[name] = {
                "input_tokens": inp_tok,
                "output_tokens": out_tok,
            }

    async def run(self, product: ProductItem) -> ProductItem:
        """Runs every workflow on the product and stores the results by workflow name.

        A workflow that raises is logged (with traceback) and skipped; the
        remaining workflows are still run.

        Args:
            product: The product item to process.

        Returns:
            The same product item, updated in place.
        """
        for wf in self._workflows:
            logger.info(
                'Running workflow="%s" for product with url="%s".',
                wf.name,
                product.url_resolved,
            )
            try:
                res = await wf.run(product=product)
            except Exception:
                # Best effort: one failing workflow must not abort the others.
                logger.exception(
                    'Error while running workflow="%s" for product with url="%s"',
                    wf.name,
                    product.url_resolved,
                )
                continue

            self._store_result(product=product, name=wf.name, res=res)

        return product