fraudcrawler 0.5.0__py3-none-any.whl → 0.7.22__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- fraudcrawler/__init__.py +21 -5
- fraudcrawler/base/base.py +18 -38
- fraudcrawler/base/client.py +57 -60
- fraudcrawler/base/orchestrator.py +277 -276
- fraudcrawler/base/retry.py +25 -11
- fraudcrawler/launch_demo_pipeline.py +103 -41
- fraudcrawler/processing/base.py +129 -0
- fraudcrawler/processing/openai.py +520 -0
- fraudcrawler/scraping/enrich.py +6 -4
- fraudcrawler/scraping/search.py +370 -110
- fraudcrawler/scraping/url.py +42 -3
- fraudcrawler/scraping/zyte.py +146 -80
- fraudcrawler/settings.py +22 -10
- fraudcrawler-0.7.22.dist-info/METADATA +173 -0
- fraudcrawler-0.7.22.dist-info/RECORD +23 -0
- fraudcrawler/processing/processor.py +0 -199
- fraudcrawler-0.5.0.dist-info/METADATA +0 -167
- fraudcrawler-0.5.0.dist-info/RECORD +0 -22
- {fraudcrawler-0.5.0.dist-info → fraudcrawler-0.7.22.dist-info}/LICENSE +0 -0
- {fraudcrawler-0.5.0.dist-info → fraudcrawler-0.7.22.dist-info}/WHEEL +0 -0
- {fraudcrawler-0.5.0.dist-info → fraudcrawler-0.7.22.dist-info}/entry_points.txt +0 -0
fraudcrawler/base/retry.py
CHANGED
|
@@ -1,4 +1,3 @@
|
|
|
1
|
-
from httpx import HTTPStatusError
|
|
2
1
|
from tenacity import (
|
|
3
2
|
AsyncRetrying,
|
|
4
3
|
retry_if_exception,
|
|
@@ -17,24 +16,39 @@ from fraudcrawler.settings import (
|
|
|
17
16
|
|
|
18
17
|
|
|
19
18
|
def _is_retryable_exception(err: BaseException) -> bool:
|
|
20
|
-
if
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
)
|
|
19
|
+
"""Checks if failing HTTP connection is worth to be re-tried."""
|
|
20
|
+
|
|
21
|
+
# Get status_code from err
|
|
22
|
+
response = getattr(err, "response", None)
|
|
23
|
+
if response is not None:
|
|
24
|
+
status_code = getattr(response, "status_code", None)
|
|
25
|
+
else:
|
|
26
|
+
status_code = getattr(err, "status_code", None)
|
|
27
|
+
|
|
28
|
+
# Check if we skip retry
|
|
29
|
+
if status_code is not None and status_code in RETRY_SKIP_IF_CODE:
|
|
24
30
|
return False
|
|
31
|
+
|
|
32
|
+
# Else we do try it again
|
|
25
33
|
return True
|
|
26
34
|
|
|
27
35
|
|
|
28
|
-
def get_async_retry(
|
|
36
|
+
def get_async_retry(
|
|
37
|
+
stop_after: int = RETRY_STOP_AFTER_ATTEMPT,
|
|
38
|
+
initial_delay: int = RETRY_INITIAL_DELAY,
|
|
39
|
+
max_delay: int = RETRY_MAX_DELAY,
|
|
40
|
+
exp_base: int = RETRY_EXP_BASE,
|
|
41
|
+
jitter: int = RETRY_JITTER,
|
|
42
|
+
) -> AsyncRetrying:
|
|
29
43
|
"""returns the retry configuration for async operations."""
|
|
30
44
|
return AsyncRetrying(
|
|
31
45
|
retry=retry_if_exception(_is_retryable_exception),
|
|
32
|
-
stop=stop_after_attempt(
|
|
46
|
+
stop=stop_after_attempt(stop_after),
|
|
33
47
|
wait=wait_exponential_jitter(
|
|
34
|
-
initial=
|
|
35
|
-
max=
|
|
36
|
-
exp_base=
|
|
37
|
-
jitter=
|
|
48
|
+
initial=initial_delay,
|
|
49
|
+
max=max_delay,
|
|
50
|
+
exp_base=exp_base,
|
|
51
|
+
jitter=jitter,
|
|
38
52
|
),
|
|
39
53
|
reraise=True,
|
|
40
54
|
)
|
|
@@ -1,55 +1,84 @@
|
|
|
1
|
+
import asyncio
|
|
1
2
|
import logging
|
|
3
|
+
from typing import Sequence
|
|
2
4
|
|
|
3
|
-
from fraudcrawler import
|
|
5
|
+
from fraudcrawler.base.base import Setup
|
|
6
|
+
from fraudcrawler import (
|
|
7
|
+
FraudCrawlerClient,
|
|
8
|
+
HttpxAsyncClient,
|
|
9
|
+
Searcher,
|
|
10
|
+
Enricher,
|
|
11
|
+
URLCollector,
|
|
12
|
+
ZyteAPI,
|
|
13
|
+
SearchEngineName,
|
|
14
|
+
Language,
|
|
15
|
+
Location,
|
|
16
|
+
Deepness,
|
|
17
|
+
Processor,
|
|
18
|
+
Workflow,
|
|
19
|
+
OpenAIClassification,
|
|
20
|
+
)
|
|
4
21
|
|
|
5
22
|
LOG_FMT = "%(asctime)s | %(name)s | %(funcName)s | %(levelname)s | %(message)s"
|
|
6
23
|
LOG_LVL = "INFO"
|
|
7
24
|
DATE_FMT = "%Y-%m-%d %H:%M:%S"
|
|
25
|
+
SETUP = Setup() # type: ignore[call-arg]
|
|
8
26
|
logging.basicConfig(format=LOG_FMT, level=LOG_LVL, datefmt=DATE_FMT)
|
|
9
27
|
|
|
10
28
|
|
|
11
|
-
def
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
29
|
+
def _setup_workflows(http_client: HttpxAsyncClient) -> Sequence[Workflow]:
|
|
30
|
+
"""Sets up the set of workflows to be run iteratively."""
|
|
31
|
+
_AVAILABILITY_SYSTEM_PROMPT = (
|
|
32
|
+
"You are a helpful and intelligent assistant helping an organization that is interested in checking the availability of certain products."
|
|
33
|
+
"Your task is to classify any given product as either available (1) or not available (0), strictly based on the context and product details provided by the user. "
|
|
34
|
+
"You must consider all aspects of the given context and make a binary decision accordingly. "
|
|
35
|
+
"If the product can be purchased, added to a shopping basket, delivered, or is listed as available in any form, classify it as 1 (available); "
|
|
36
|
+
"if there is any mention of out of stock, not available, no longer shippable, or similar, classify it as 0 (not available). "
|
|
37
|
+
"Respond only with the number 1 or 0."
|
|
38
|
+
)
|
|
39
|
+
_SERIOUSNESS_SYSTEM_PROMPT = (
|
|
40
|
+
"You are a helpful and intelligent assistant helping an organization that is interested in checking the energy efficiency of certain devices. "
|
|
41
|
+
"Your task is to classify each item as either a product for sale (1) or not a product for sale (0). To make this distinction, consider the following criteria: \n"
|
|
42
|
+
" 1 Product for Sale (1): Classify as 1 if the result clearly indicates an item available for purchase, typically found "
|
|
43
|
+
"within an online shop or marketplace.\n"
|
|
44
|
+
" 2 Not a Product for Sale (0): Classify as 0 if the result is unrelated to a direct purchase of a product. This includes items such as: \n"
|
|
45
|
+
" - Books and Videos: These may be available for sale, but if they are about or related to the searched product rather than being the "
|
|
46
|
+
"exact product itself, classify as 0.\n"
|
|
47
|
+
" - Advertisements: Promotional content that doesn't directly sell a product.\n"
|
|
48
|
+
" - Companies and Services: Names and descriptions of companies or services related to the product but not the product itself.\n"
|
|
49
|
+
" - Related Topics/Content: Any text or media that discusses or elaborates on the topic without offering a tangible product for sale.\n"
|
|
50
|
+
"Make your decision based solely on the context and details provided in the search result. Respond only with the number 1 or 0."
|
|
51
|
+
)
|
|
52
|
+
return [
|
|
53
|
+
OpenAIClassification(
|
|
54
|
+
http_client=http_client,
|
|
21
55
|
name="availability",
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
"Your task is to classify any given product as either available (1) or not available (0), strictly based on the context and product details provided by the user. "
|
|
25
|
-
"You must consider all aspects of the given context and make a binary decision accordingly. "
|
|
26
|
-
"If the product can be purchased, added to a shopping basket, delivered, or is listed as available in any form, classify it as 1 (available); "
|
|
27
|
-
"if there is any mention of out of stock, not available, no longer shippable, or similar, classify it as 0 (not available). "
|
|
28
|
-
"Respond only with the number 1 or 0."
|
|
29
|
-
),
|
|
56
|
+
api_key=SETUP.openaiapi_key,
|
|
57
|
+
model="gpt-4o",
|
|
30
58
|
product_item_fields=["product_name", "html_clean"],
|
|
59
|
+
system_prompt=_AVAILABILITY_SYSTEM_PROMPT,
|
|
60
|
+
allowed_classes=[0, 1],
|
|
61
|
+
),
|
|
62
|
+
OpenAIClassification(
|
|
63
|
+
http_client=http_client,
|
|
64
|
+
name="seriousness",
|
|
65
|
+
api_key=SETUP.openaiapi_key,
|
|
66
|
+
model="gpt-4o",
|
|
67
|
+
product_item_fields=["product_name", "product_description"],
|
|
68
|
+
system_prompt=_SERIOUSNESS_SYSTEM_PROMPT,
|
|
31
69
|
allowed_classes=[0, 1],
|
|
32
70
|
),
|
|
33
|
-
# Prompt(
|
|
34
|
-
# name="seriousness",
|
|
35
|
-
# system_prompt=(
|
|
36
|
-
# "You are a helpful and intelligent assistant helping an organization that is interested in checking the energy efficiency of certain devices. "
|
|
37
|
-
# "Your task is to classify each item as either a product for sale (1) or not a product for sale (0). To make this distinction, consider the following criteria: \n"
|
|
38
|
-
# " 1 Product for Sale (1): Classify as 1 if the result clearly indicates an item available for purchase, typically found "
|
|
39
|
-
# "within an online shop or marketplace.\n"
|
|
40
|
-
# " 2 Not a Product for Sale (0): Classify as 0 if the result is unrelated to a direct purchase of a product. This includes items such as: \n"
|
|
41
|
-
# " - Books and Videos: These may be available for sale, but if they are about or related to the searched product rather than being the "
|
|
42
|
-
# "exact product itself, classify as 0.\n"
|
|
43
|
-
# " - Advertisements: Promotional content that doesn't directly sell a product.\n"
|
|
44
|
-
# " - Companies and Services: Names and descriptions of companies or services related to the product but not the product itself.\n"
|
|
45
|
-
# " - Related Topics/Content: Any text or media that discusses or elaborates on the topic without offering a tangible product for sale.\n"
|
|
46
|
-
# "Make your decision based solely on the context and details provided in the search result. Respond only with the number 1 or 0."
|
|
47
|
-
# ),
|
|
48
|
-
# product_item_fields=["product_name", "product_description"],
|
|
49
|
-
# allowed_classes=[0, 1],
|
|
50
|
-
# ),
|
|
51
71
|
]
|
|
52
|
-
|
|
72
|
+
|
|
73
|
+
|
|
74
|
+
async def run(http_client: HttpxAsyncClient, search_term: str):
|
|
75
|
+
# Setup the search
|
|
76
|
+
search_engines = list(SearchEngineName)
|
|
77
|
+
language = Language(name="German")
|
|
78
|
+
location = Location(name="Switzerland")
|
|
79
|
+
deepness = Deepness(num_results=10)
|
|
80
|
+
|
|
81
|
+
# # Optional: Add term enrichment
|
|
53
82
|
# from fraudcrawler import Enrichment
|
|
54
83
|
|
|
55
84
|
# deepness.enrichment = Enrichment(additional_terms=10, additional_urls_per_term=20)
|
|
@@ -66,13 +95,41 @@ def search(search_term: str):
|
|
|
66
95
|
Host(name="Brack", domains="brack.ch"),
|
|
67
96
|
]
|
|
68
97
|
|
|
98
|
+
# Setup clients
|
|
99
|
+
searcher = Searcher(
|
|
100
|
+
http_client=http_client,
|
|
101
|
+
serpapi_key=SETUP.serpapi_key,
|
|
102
|
+
zyteapi_key=SETUP.zyteapi_key,
|
|
103
|
+
)
|
|
104
|
+
enricher = Enricher(
|
|
105
|
+
http_client=http_client,
|
|
106
|
+
user=SETUP.dataforseo_user,
|
|
107
|
+
pwd=SETUP.dataforseo_pwd,
|
|
108
|
+
)
|
|
109
|
+
url_collector = URLCollector()
|
|
110
|
+
zyteapi = ZyteAPI(
|
|
111
|
+
http_client=http_client,
|
|
112
|
+
api_key=SETUP.zyteapi_key,
|
|
113
|
+
)
|
|
114
|
+
workflows = _setup_workflows(http_client=http_client)
|
|
115
|
+
processor = Processor(workflows=workflows)
|
|
116
|
+
|
|
117
|
+
# Setup the client
|
|
118
|
+
client = FraudCrawlerClient(
|
|
119
|
+
searcher=searcher,
|
|
120
|
+
enricher=enricher,
|
|
121
|
+
url_collector=url_collector,
|
|
122
|
+
zyteapi=zyteapi,
|
|
123
|
+
processor=processor,
|
|
124
|
+
)
|
|
125
|
+
|
|
69
126
|
# Execute the pipeline
|
|
70
|
-
client.
|
|
127
|
+
await client.run(
|
|
71
128
|
search_term=search_term,
|
|
129
|
+
search_engines=search_engines,
|
|
72
130
|
language=language,
|
|
73
131
|
location=location,
|
|
74
132
|
deepness=deepness,
|
|
75
|
-
prompts=prompts,
|
|
76
133
|
# marketplaces=marketplaces,
|
|
77
134
|
excluded_urls=excluded_urls,
|
|
78
135
|
)
|
|
@@ -96,5 +153,10 @@ def search(search_term: str):
|
|
|
96
153
|
print()
|
|
97
154
|
|
|
98
155
|
|
|
156
|
+
async def main(search_term: str):
|
|
157
|
+
async with HttpxAsyncClient() as http_client:
|
|
158
|
+
await run(http_client=http_client, search_term=search_term)
|
|
159
|
+
|
|
160
|
+
|
|
99
161
|
if __name__ == "__main__":
|
|
100
|
-
|
|
162
|
+
asyncio.run(main(search_term="Kaffeebohnen"))
|
|
@@ -0,0 +1,129 @@
|
|
|
1
|
+
from abc import ABC, abstractmethod
|
|
2
|
+
import logging
|
|
3
|
+
from pydantic import BaseModel
|
|
4
|
+
from typing import Any, Dict, List, Sequence, TypeAlias
|
|
5
|
+
|
|
6
|
+
from fraudcrawler.base.base import ProductItem
|
|
7
|
+
|
|
8
|
+
logger = logging.getLogger(__name__)
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
UserInputs: TypeAlias = Dict[str, List[str]]
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
class ClassificationResult(BaseModel):
|
|
15
|
+
"""Model for classification results."""
|
|
16
|
+
|
|
17
|
+
result: int
|
|
18
|
+
input_tokens: int = 0
|
|
19
|
+
output_tokens: int = 0
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
class TmpResult(BaseModel):
|
|
23
|
+
"""Model for tmp results."""
|
|
24
|
+
|
|
25
|
+
result: Any
|
|
26
|
+
input_tokens: int = 0
|
|
27
|
+
output_tokens: int = 0
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
WorkflowResult: TypeAlias = ClassificationResult | TmpResult | None
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
class Workflow(ABC):
|
|
34
|
+
"""Abstract base class for independent processing workflows."""
|
|
35
|
+
|
|
36
|
+
def __init__(
|
|
37
|
+
self,
|
|
38
|
+
name: str,
|
|
39
|
+
):
|
|
40
|
+
"""Abstract base class for defining a classification workflow.
|
|
41
|
+
|
|
42
|
+
Args:
|
|
43
|
+
name: Name of the classification workflow.
|
|
44
|
+
"""
|
|
45
|
+
self.name = name
|
|
46
|
+
|
|
47
|
+
@abstractmethod
|
|
48
|
+
async def run(self, product: ProductItem) -> WorkflowResult:
|
|
49
|
+
"""Runs the workflow."""
|
|
50
|
+
pass
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
class Processor:
|
|
54
|
+
"""Processing product items for a set of classification workflows."""
|
|
55
|
+
|
|
56
|
+
def __init__(self, workflows: Sequence[Workflow]):
|
|
57
|
+
"""Initializes the Processor.
|
|
58
|
+
|
|
59
|
+
Args:
|
|
60
|
+
workflows: Sequence of workflows for classification of product items.
|
|
61
|
+
"""
|
|
62
|
+
if not self._are_unique(workflows=workflows):
|
|
63
|
+
raise ValueError(
|
|
64
|
+
f"Workflow names are not unique: {[wf.name for wf in workflows]}"
|
|
65
|
+
)
|
|
66
|
+
self._workflows = workflows
|
|
67
|
+
|
|
68
|
+
@staticmethod
|
|
69
|
+
def _are_unique(workflows: Sequence[Workflow]) -> bool:
|
|
70
|
+
"""Tests if the workflows have unique names."""
|
|
71
|
+
return len(workflows) == len(set([wf.name for wf in workflows]))
|
|
72
|
+
|
|
73
|
+
async def run(self, product: ProductItem) -> ProductItem:
|
|
74
|
+
"""Run the processing step for multiple workflows and return all results together with workflow.name.
|
|
75
|
+
|
|
76
|
+
Args:
|
|
77
|
+
product: The product item to process.
|
|
78
|
+
"""
|
|
79
|
+
for wf in self._workflows:
|
|
80
|
+
try:
|
|
81
|
+
logger.info(
|
|
82
|
+
f'Running workflow="{wf.name}" for product with url="{product.url_resolved}".'
|
|
83
|
+
)
|
|
84
|
+
res = await wf.run(product=product)
|
|
85
|
+
except Exception:
|
|
86
|
+
logger.error(
|
|
87
|
+
f'Error while running workflow="{wf.name}" for product with url="{product.url_resolved}"',
|
|
88
|
+
exc_info=True,
|
|
89
|
+
)
|
|
90
|
+
continue
|
|
91
|
+
|
|
92
|
+
# Update the product item
|
|
93
|
+
inp_tok = out_tok = 0
|
|
94
|
+
if isinstance(res, ClassificationResult):
|
|
95
|
+
logger.debug(
|
|
96
|
+
f'result from workflow="{wf.name}" added to product.classifications'
|
|
97
|
+
)
|
|
98
|
+
product.classifications[wf.name] = int(res.result)
|
|
99
|
+
inp_tok = res.input_tokens
|
|
100
|
+
out_tok = res.output_tokens
|
|
101
|
+
|
|
102
|
+
elif isinstance(res, TmpResult):
|
|
103
|
+
logger.debug(f'result from workflow="{wf.name}" added to product.tmp')
|
|
104
|
+
product.tmp[wf.name] = res
|
|
105
|
+
inp_tok = res.input_tokens
|
|
106
|
+
out_tok = res.output_tokens
|
|
107
|
+
|
|
108
|
+
elif res is None:
|
|
109
|
+
logger.debug(
|
|
110
|
+
f'result from workflow="{wf.name}" is `None` and therefore not stored'
|
|
111
|
+
)
|
|
112
|
+
|
|
113
|
+
else:
|
|
114
|
+
logger.warning(
|
|
115
|
+
f'result from workflow="{wf.name}" return type={type(res)} is not allowed; '
|
|
116
|
+
f"must either be of type `ClassificationResult`, "
|
|
117
|
+
f"`TmpResult`, or `None`; not type={type(res)}"
|
|
118
|
+
)
|
|
119
|
+
|
|
120
|
+
if inp_tok > 0 or out_tok > 0:
|
|
121
|
+
logger.debug(
|
|
122
|
+
f'result from workflow="{wf.name}" used input_tokens={inp_tok}, output_tokens={out_tok}'
|
|
123
|
+
)
|
|
124
|
+
product.usage[wf.name] = {
|
|
125
|
+
"input_tokens": inp_tok,
|
|
126
|
+
"output_tokens": out_tok,
|
|
127
|
+
}
|
|
128
|
+
|
|
129
|
+
return product
|