fraudcrawler 0.5.0__py3-none-any.whl → 0.7.26__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- fraudcrawler/__init__.py +21 -5
- fraudcrawler/base/base.py +18 -38
- fraudcrawler/base/client.py +57 -60
- fraudcrawler/base/orchestrator.py +277 -276
- fraudcrawler/base/retry.py +25 -11
- fraudcrawler/launch_demo_pipeline.py +103 -41
- fraudcrawler/processing/base.py +151 -0
- fraudcrawler/processing/openai.py +521 -0
- fraudcrawler/scraping/enrich.py +6 -4
- fraudcrawler/scraping/search.py +370 -110
- fraudcrawler/scraping/url.py +42 -3
- fraudcrawler/scraping/zyte.py +146 -80
- fraudcrawler/settings.py +22 -10
- fraudcrawler-0.7.26.dist-info/METADATA +173 -0
- fraudcrawler-0.7.26.dist-info/RECORD +23 -0
- fraudcrawler/processing/processor.py +0 -199
- fraudcrawler-0.5.0.dist-info/METADATA +0 -167
- fraudcrawler-0.5.0.dist-info/RECORD +0 -22
- {fraudcrawler-0.5.0.dist-info → fraudcrawler-0.7.26.dist-info}/LICENSE +0 -0
- {fraudcrawler-0.5.0.dist-info → fraudcrawler-0.7.26.dist-info}/WHEEL +0 -0
- {fraudcrawler-0.5.0.dist-info → fraudcrawler-0.7.26.dist-info}/entry_points.txt +0 -0
fraudcrawler/__init__.py
CHANGED
@@ -1,8 +1,19 @@
-from fraudcrawler.scraping.search import
+from fraudcrawler.scraping.search import Searcher, SearchEngineName
 from fraudcrawler.scraping.enrich import Enricher
 from fraudcrawler.scraping.url import URLCollector
 from fraudcrawler.scraping.zyte import ZyteAPI
-from fraudcrawler.processing.
+from fraudcrawler.processing.base import (
+    UserInputs,
+    Workflow,
+    ClassificationResult,
+    TmpResult,
+    Processor,
+)
+from fraudcrawler.processing.openai import (
+    OpenAIWorkflow,
+    OpenAIClassification,
+    OpenAIClassificationUserInputs,
+)
 from fraudcrawler.base.orchestrator import Orchestrator
 from fraudcrawler.base.client import FraudCrawlerClient
 from fraudcrawler.base.base import (
@@ -11,17 +22,23 @@ from fraudcrawler.base.base import (
     Host,
     Language,
     Location,
-    Prompt,
     ProductItem,
     HttpxAsyncClient,
 )

 __all__ = [
-    "
+    "Searcher",
     "SearchEngineName",
     "Enricher",
     "URLCollector",
     "ZyteAPI",
+    "UserInputs",
+    "Workflow",
+    "ClassificationResult",
+    "TmpResult",
+    "OpenAIWorkflow",
+    "OpenAIClassification",
+    "OpenAIClassificationUserInputs",
     "Processor",
     "Orchestrator",
     "ProductItem",
@@ -31,6 +48,5 @@ __all__ = [
     "Host",
     "Deepness",
     "Enrichment",
-    "Prompt",
     "HttpxAsyncClient",
 ]
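The upshot of this change: the `Prompt` model disappears from the public API, and the new processing abstractions (`UserInputs`, `Workflow`, `ClassificationResult`, `TmpResult`, `Processor`, plus their OpenAI-backed implementations) are re-exported from the package root. A minimal sketch of the new import surface; every name below is imported or listed in `__all__` in the `__init__.py` diff above, but the snippet itself is illustrative rather than taken from the project docs:

```python
# Names importable from the fraudcrawler package root as of 0.7.26
# (all appear in the __init__.py diff above).
from fraudcrawler import (
    Searcher,
    SearchEngineName,
    Enricher,
    URLCollector,
    ZyteAPI,
    Processor,
    OpenAIWorkflow,
    OpenAIClassification,
    OpenAIClassificationUserInputs,
    FraudCrawlerClient,
)

# Code written against 0.5.0 that did `from fraudcrawler import Prompt`
# will now raise ImportError and needs porting to the Workflow/UserInputs models.
```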
fraudcrawler/base/base.py
CHANGED
@@ -11,6 +11,7 @@ from urllib.parse import urlparse
 import re
 from typing import Any, Dict, List

+
 import httpx

 from fraudcrawler.settings import (
@@ -44,6 +45,7 @@ class Setup(BaseSettings):
     dataforseo_pwd: str
     zyteapi_key: str
     openaiapi_key: str
+    pypy_token: str

     class Config:
         env_file = ".env"
@@ -69,14 +71,6 @@ class Host(BaseModel):
         return [cls._normalize_domain(dom.strip()) for dom in val]


-class ClassificationResult(BaseModel):
-    """Model for classification results."""
-
-    result: int
-    input_tokens: int
-    output_tokens: int
-
-
 class Location(BaseModel):
     """Model for location details (e.g. `Location(name="Switzerland", code="ch")`)."""

@@ -132,25 +126,30 @@ class Deepness(BaseModel):
 class ProductItem(BaseModel):
     """Model representing a product item."""

-    #
+    # Search parameters
     search_term: str
     search_term_type: str
     url: str
     url_resolved: str
     search_engine_name: str
     domain: str
+    exact_search: bool = False
+    exact_search_match: bool = False

-    #
+    # Context parameters
     product_name: str | None = None
     product_price: str | None = None
     product_description: str | None = None
     product_images: List[str] | None = None
+    product_gtin: str | None = None
     probability: float | None = None
     html: str | None = None
     html_clean: str | None = None

-    # Processor parameters
+    # Processor parameters (set dynamically)
     classifications: Dict[str, int] = Field(default_factory=dict)
+    tmp: Dict[str, Any] = Field(default_factory=dict)
+    insights: Dict[str, Any] | None = Field(default=None)

     # Usage parameters
     usage: Dict[str, Dict[str, int]] = Field(default_factory=dict)
@@ -160,33 +159,6 @@
     filtered_at_stage: str | None = None


-class Prompt(BaseModel):
-    """Model for prompts."""
-
-    name: str
-    system_prompt: str
-    product_item_fields: List[str]
-    allowed_classes: List[int]
-
-    @field_validator("allowed_classes", mode="before")
-    def check_for_positive_value(cls, val):
-        """Check if all values are positive."""
-        if not all(isinstance(i, int) and i >= 0 for i in val):
-            raise ValueError("all values in allowed_classes must be positive integers.")
-        return val
-
-    @field_validator("product_item_fields", mode="before")
-    def validate_product_item_fields(cls, val):
-        """Ensure all product_item_fields are valid ProductItem attributes."""
-        valid_fields = set(ProductItem.model_fields.keys())
-        for field in val:
-            if field not in valid_fields:
-                raise ValueError(
-                    f"Invalid product_item_field: '{field}'. Must be one of: {sorted(valid_fields)}"
-                )
-        return val
-
-
 class HttpxAsyncClient(httpx.AsyncClient):
     """Httpx async client that can be used to retain the default settings."""

@@ -216,6 +188,14 @@ class DomainUtils:
     """

     _hostname_pattern = r"^(?:https?:\/\/)?([^\/:?#]+)"
+    _headers = {
+        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
+        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
+        "Accept-Language": "en-US,en;q=0.5",
+        "Accept-Encoding": "gzip, deflate",
+        "Connection": "keep-alive",
+        "Upgrade-Insecure-Requests": "1",
+    }

     def _get_domain(self, url: str) -> str:
         """Extracts the second-level domain together with the top-level domain (e.g. `google.com`).
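To make the `ProductItem` changes concrete, here is a hedged construction sketch. The field names, types, and defaults come directly from the diff above; the example values are invented, and standard pydantic keyword construction is assumed:

```python
from fraudcrawler import ProductItem

# Required "search parameter" fields plus the fields added in 0.7.26
# (exact_search, exact_search_match, product_gtin). Values are made up for the example.
item = ProductItem(
    search_term="sibutramine",
    search_term_type="keyword",
    url="https://example-shop.com/product/123",
    url_resolved="https://example-shop.com/product/123",
    search_engine_name="google",
    domain="example-shop.com",
    exact_search=True,                 # new in 0.7.26, defaults to False
    product_gtin="07612345678900",     # new optional context field
)

# Processor results now go into dynamically populated containers; the old Prompt model is gone.
item.classifications["suspiciousness"] = 1
item.tmp["raw_llm_output"] = {"label": 1}   # new tmp dict, default_factory=dict
print(item.exact_search_match)              # new flag, defaults to False
```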
fraudcrawler/base/client.py
CHANGED
@@ -4,22 +4,25 @@ from datetime import datetime
 import logging
 from pathlib import Path
 from pydantic import BaseModel
-from typing import List
+from typing import List

 import pandas as pd

 from fraudcrawler.settings import ROOT_DIR
 from fraudcrawler.base.base import (
-    Setup,
     Language,
     Location,
     Deepness,
     Host,
-    Prompt,
     ProductItem,
 )
 from fraudcrawler.base.orchestrator import Orchestrator
-from fraudcrawler.scraping.search import SearchEngineName
+from fraudcrawler.scraping.search import Searcher, SearchEngineName
+from fraudcrawler.scraping.enrich import Enricher
+from fraudcrawler.scraping.url import URLCollector
+from fraudcrawler.scraping.zyte import ZyteAPI
+from fraudcrawler.processing.base import Processor
+

 logger = logging.getLogger(__name__)

@@ -34,18 +37,38 @@ class Results(BaseModel):


 class FraudCrawlerClient(Orchestrator):
-    """The main client for FraudCrawler.
+    """The main client for FraudCrawler product search and analysis.
+
+    This client orchestrates the complete pipeline: search, deduplication, context extraction,
+    processing (classification), and result collection. It inherits from Orchestrator and adds
+    result management and persistence functionality.
+    """
+
+    _FILENAME_TEMPLATE = "{search_term}_{language}_{location}_{timestamp}.csv"

-
+    def __init__(
+        self,
+        searcher: Searcher,
+        enricher: Enricher,
+        url_collector: URLCollector,
+        zyteapi: ZyteAPI,
+        processor: Processor,
+    ):
+        """Initializes FraudCrawlerClient.

-
-
+        Args:
+            searcher: Client for searching step.
+            enricher: Client for enrichment step.
+            url_collector: Client for deduplication.
+            zyteapi: Client for metadata extraction.
+            processor: Client for product classification.
+        """
         super().__init__(
-
-
-
-
-
+            searcher=searcher,
+            enricher=enricher,
+            url_collector=url_collector,
+            zyteapi=zyteapi,
+            processor=processor,
         )

         self._results_dir = _RESULTS_DIR
@@ -53,13 +76,6 @@ class FraudCrawlerClient(Orchestrator):
         self._results_dir.mkdir(parents=True)
         self._results: List[Results] = []

-    async def __aenter__(self) -> Self:
-        await super().__aenter__()  # let base set itself up
-        return self  # so `async with FraudCrawlerClient()` gives you this instance
-
-    async def __aexit__(self, *args, **kwargs) -> None:
-        await super().__aexit__(*args, **kwargs)
-
     async def _collect_results(
         self, queue_in: asyncio.Queue[ProductItem | None]
     ) -> None:
@@ -80,45 +96,38 @@ class FraudCrawlerClient(Orchestrator):

         # Convert the list of products to a DataFrame
         df = pd.json_normalize(products)
-        cols = [c.split(".")[-1] for c in df.columns]
-        if len(cols) != len(set(cols)):
-            logger.error("Duplicate columns after json_normalize.")
-        else:
-            df.columns = cols

         # Save the DataFrame to a CSV file
         filename = self._results[-1].filename
         df.to_csv(filename, index=False, quoting=csv.QUOTE_ALL)
         logger.info(f"Results saved to {filename}")

-    def
+    async def run(
         self,
         search_term: str,
+        search_engines: List[SearchEngineName],
         language: Language,
         location: Location,
         deepness: Deepness,
-        prompts: List[Prompt],
         marketplaces: List[Host] | None = None,
         excluded_urls: List[Host] | None = None,
-        search_engines: List[SearchEngineName | str] | None = None,
         previously_collected_urls: List[str] | None = None,
     ) -> None:
-        """Runs the pipeline steps:
+        """Runs the pipeline steps: search, deduplication, context extraction, processing, and collect the results.

         Args:
             search_term: The search term for the query.
+            search_engines: The list of search engines to use for the search query.
             language: The language to use for the query.
             location: The location to use for the query.
             deepness: The search depth and enrichment details.
-
-
-
-            search_engines: The list of search engines to use for the search (optional).
-            previously_collected_urls: The urls that have been collected previously and are ignored (optional).
+            marketplaces: The marketplaces to include in the search.
+            excluded_urls: The URLs to exclude from the search.
+            previously_collected_urls: The urls that have been collected previously and are ignored.
         """
         # Handle results files
         timestamp = datetime.today().strftime("%Y%m%d%H%M%S")
-        filename = self._results_dir / self.
+        filename = self._results_dir / self._FILENAME_TEMPLATE.format(
             search_term=search_term,
             language=language.code,
             location=location.code,
@@ -126,31 +135,16 @@ class FraudCrawlerClient(Orchestrator):
         )
         self._results.append(Results(search_term=search_term, filename=filename))

-        # Normalize inputs - convert strings to SearchEngineName enum values
-        nrm_search_engines = list(SearchEngineName)
-        if search_engines:
-            nrm_search_engines = [
-                SearchEngineName(se) if isinstance(se, str) else se
-                for se in search_engines
-            ]
-
         # Run the pipeline by calling the orchestrator's run method
-
-
-
-
-
-
-
-
-
-            location=location,
-            deepness=deepness,
-            prompts=prompts,
-            marketplaces=marketplaces,
-            excluded_urls=excluded_urls,
-            previously_collected_urls=previously_collected_urls,
-        )
+        await super().run(
+            search_term=search_term,
+            search_engines=search_engines,
+            language=language,
+            location=location,
+            deepness=deepness,
+            marketplaces=marketplaces,
+            excluded_urls=excluded_urls,
+            previously_collected_urls=previously_collected_urls,
         )

     def load_results(self, index: int = -1) -> pd.DataFrame:
@@ -161,7 +155,10 @@ class FraudCrawlerClient(Orchestrator):
         """

         results = self._results[index]
-
+        if (filename := results.filename) is None:
+            raise ValueError("filename not found (is None)")
+
+        return pd.read_csv(filename)

     def print_available_results(self) -> None:
         """Prints the available results."""