fraudcrawler 0.5.0__py3-none-any.whl → 0.7.26__py3-none-any.whl

This diff shows the content of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the versions as they appear in their respective public registries.
fraudcrawler/__init__.py CHANGED
@@ -1,8 +1,19 @@
- from fraudcrawler.scraping.search import Search, SearchEngineName
+ from fraudcrawler.scraping.search import Searcher, SearchEngineName
  from fraudcrawler.scraping.enrich import Enricher
  from fraudcrawler.scraping.url import URLCollector
  from fraudcrawler.scraping.zyte import ZyteAPI
- from fraudcrawler.processing.processor import Processor
+ from fraudcrawler.processing.base import (
+     UserInputs,
+     Workflow,
+     ClassificationResult,
+     TmpResult,
+     Processor,
+ )
+ from fraudcrawler.processing.openai import (
+     OpenAIWorkflow,
+     OpenAIClassification,
+     OpenAIClassificationUserInputs,
+ )
  from fraudcrawler.base.orchestrator import Orchestrator
  from fraudcrawler.base.client import FraudCrawlerClient
  from fraudcrawler.base.base import (
@@ -11,17 +22,23 @@ from fraudcrawler.base.base import (
      Host,
      Language,
      Location,
-     Prompt,
      ProductItem,
      HttpxAsyncClient,
  )

  __all__ = [
-     "Search",
+     "Searcher",
      "SearchEngineName",
      "Enricher",
      "URLCollector",
      "ZyteAPI",
+     "UserInputs",
+     "Workflow",
+     "ClassificationResult",
+     "TmpResult",
+     "OpenAIWorkflow",
+     "OpenAIClassification",
+     "OpenAIClassificationUserInputs",
      "Processor",
      "Orchestrator",
      "ProductItem",
@@ -31,6 +48,5 @@ __all__ = [
      "Host",
      "Deepness",
      "Enrichment",
-     "Prompt",
      "HttpxAsyncClient",
  ]
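
For downstream users, the visible effect of this change is the rename of `Search` to `Searcher`, the removal of `Prompt`, and the new processing exports. A minimal, hypothetical import update for 0.7.x, using only names that appear in the `__all__` list above:

# Hypothetical 0.7.x imports; `Search` and `Prompt` are no longer exported,
# and the OpenAI-based workflow classes are now part of the public API.
from fraudcrawler import (
    Searcher,
    SearchEngineName,
    OpenAIWorkflow,
    OpenAIClassification,
    OpenAIClassificationUserInputs,
    ProductItem,
)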
fraudcrawler/base/base.py CHANGED
@@ -11,6 +11,7 @@ from urllib.parse import urlparse
  import re
  from typing import Any, Dict, List

+ 
  import httpx

  from fraudcrawler.settings import (
@@ -44,6 +45,7 @@ class Setup(BaseSettings):
      dataforseo_pwd: str
      zyteapi_key: str
      openaiapi_key: str
+     pypy_token: str

      class Config:
          env_file = ".env"
@@ -69,14 +71,6 @@ class Host(BaseModel):
          return [cls._normalize_domain(dom.strip()) for dom in val]


- class ClassificationResult(BaseModel):
-     """Model for classification results."""
- 
-     result: int
-     input_tokens: int
-     output_tokens: int
- 
- 
  class Location(BaseModel):
      """Model for location details (e.g. `Location(name="Switzerland", code="ch")`)."""

@@ -132,25 +126,30 @@ class Deepness(BaseModel):
  class ProductItem(BaseModel):
      """Model representing a product item."""

-     # Serp/Enrich parameters
+     # Search parameters
      search_term: str
      search_term_type: str
      url: str
      url_resolved: str
      search_engine_name: str
      domain: str
+     exact_search: bool = False
+     exact_search_match: bool = False

-     # Zyte parameters
+     # Context parameters
      product_name: str | None = None
      product_price: str | None = None
      product_description: str | None = None
      product_images: List[str] | None = None
+     product_gtin: str | None = None
      probability: float | None = None
      html: str | None = None
      html_clean: str | None = None

-     # Processor parameters are set dynamic so we must allow extra fields
+     # Processor parameters (set dynamically)
      classifications: Dict[str, int] = Field(default_factory=dict)
+     tmp: Dict[str, Any] = Field(default_factory=dict)
+     insights: Dict[str, Any] | None = Field(default=None)

      # Usage parameters
      usage: Dict[str, Dict[str, int]] = Field(default_factory=dict)
@@ -160,33 +159,6 @@ class ProductItem(BaseModel):
      filtered_at_stage: str | None = None


- class Prompt(BaseModel):
-     """Model for prompts."""
- 
-     name: str
-     system_prompt: str
-     product_item_fields: List[str]
-     allowed_classes: List[int]
- 
-     @field_validator("allowed_classes", mode="before")
-     def check_for_positive_value(cls, val):
-         """Check if all values are positive."""
-         if not all(isinstance(i, int) and i >= 0 for i in val):
-             raise ValueError("all values in allowed_classes must be positive integers.")
-         return val
- 
-     @field_validator("product_item_fields", mode="before")
-     def validate_product_item_fields(cls, val):
-         """Ensure all product_item_fields are valid ProductItem attributes."""
-         valid_fields = set(ProductItem.model_fields.keys())
-         for field in val:
-             if field not in valid_fields:
-                 raise ValueError(
-                     f"Invalid product_item_field: '{field}'. Must be one of: {sorted(valid_fields)}"
-                 )
-         return val
- 
- 
  class HttpxAsyncClient(httpx.AsyncClient):
      """Httpx async client that can be used to retain the default settings."""

@@ -216,6 +188,14 @@ class DomainUtils:
      """

      _hostname_pattern = r"^(?:https?:\/\/)?([^\/:?#]+)"
+     _headers = {
+         "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
+         "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
+         "Accept-Language": "en-US,en;q=0.5",
+         "Accept-Encoding": "gzip, deflate",
+         "Connection": "keep-alive",
+         "Upgrade-Insecure-Requests": "1",
+     }

      def _get_domain(self, url: str) -> str:
          """Extracts the second-level domain together with the top-level domain (e.g. `google.com`).
fraudcrawler/base/client.py CHANGED
@@ -4,22 +4,25 @@ from datetime import datetime
  import logging
  from pathlib import Path
  from pydantic import BaseModel
- from typing import List, Self
+ from typing import List

  import pandas as pd

  from fraudcrawler.settings import ROOT_DIR
  from fraudcrawler.base.base import (
-     Setup,
      Language,
      Location,
      Deepness,
      Host,
-     Prompt,
      ProductItem,
  )
  from fraudcrawler.base.orchestrator import Orchestrator
- from fraudcrawler.scraping.search import SearchEngineName
+ from fraudcrawler.scraping.search import Searcher, SearchEngineName
+ from fraudcrawler.scraping.enrich import Enricher
+ from fraudcrawler.scraping.url import URLCollector
+ from fraudcrawler.scraping.zyte import ZyteAPI
+ from fraudcrawler.processing.base import Processor
+ 

  logger = logging.getLogger(__name__)

@@ -34,18 +37,38 @@ class Results(BaseModel):


  class FraudCrawlerClient(Orchestrator):
-     """The main client for FraudCrawler."""
+     """The main client for FraudCrawler product search and analysis.
+ 
+     This client orchestrates the complete pipeline: search, deduplication, context extraction,
+     processing (classification), and result collection. It inherits from Orchestrator and adds
+     result management and persistence functionality.
+     """
+ 
+     _FILENAME_TEMPLATE = "{search_term}_{language}_{location}_{timestamp}.csv"

-     _filename_template = "{search_term}_{language}_{location}_{timestamp}.csv"
+     def __init__(
+         self,
+         searcher: Searcher,
+         enricher: Enricher,
+         url_collector: URLCollector,
+         zyteapi: ZyteAPI,
+         processor: Processor,
+     ):
+         """Initializes FraudCrawlerClient.

-     def __init__(self):
-         setup = Setup()
+         Args:
+             searcher: Client for searching step.
+             enricher: Client for enrichment step.
+             url_collector: Client for deduplication.
+             zyteapi: Client for metadata extraction.
+             processor: Client for product classification.
+         """
          super().__init__(
-             serpapi_key=setup.serpapi_key,
-             dataforseo_user=setup.dataforseo_user,
-             dataforseo_pwd=setup.dataforseo_pwd,
-             zyteapi_key=setup.zyteapi_key,
-             openaiapi_key=setup.openaiapi_key,
+             searcher=searcher,
+             enricher=enricher,
+             url_collector=url_collector,
+             zyteapi=zyteapi,
+             processor=processor,
          )

          self._results_dir = _RESULTS_DIR
@@ -53,13 +76,6 @@ class FraudCrawlerClient(Orchestrator):
              self._results_dir.mkdir(parents=True)
          self._results: List[Results] = []

-     async def __aenter__(self) -> Self:
-         await super().__aenter__()  # let base set itself up
-         return self  # so `async with FraudCrawlerClient()` gives you this instance
- 
-     async def __aexit__(self, *args, **kwargs) -> None:
-         await super().__aexit__(*args, **kwargs)
- 
      async def _collect_results(
          self, queue_in: asyncio.Queue[ProductItem | None]
      ) -> None:
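
The constructor change above is the most visible break: the 0.5.x client configured itself from a `Setup()` read from the environment, while 0.7.x expects the caller to build and inject the five pipeline components. A rough, hypothetical wiring sketch follows; the component constructors are not part of this diff, so their arguments are left elided, and `processor` would in practice be a concrete `Processor` implementation (e.g. an OpenAI-backed one):

from fraudcrawler import (
    FraudCrawlerClient,
    Searcher,
    Enricher,
    URLCollector,
    ZyteAPI,
)

# Hypothetical 0.7.x wiring; constructor arguments of the components are not
# shown in this diff and are therefore elided with (...).
searcher = Searcher(...)
enricher = Enricher(...)
url_collector = URLCollector(...)
zyteapi = ZyteAPI(...)
processor = ...  # a concrete Processor implementation, e.g. OpenAI-based

client = FraudCrawlerClient(
    searcher=searcher,
    enricher=enricher,
    url_collector=url_collector,
    zyteapi=zyteapi,
    processor=processor,
)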
@@ -80,45 +96,38 @@ class FraudCrawlerClient(Orchestrator):

          # Convert the list of products to a DataFrame
          df = pd.json_normalize(products)
-         cols = [c.split(".")[-1] for c in df.columns]
-         if len(cols) != len(set(cols)):
-             logger.error("Duplicate columns after json_normalize.")
-         else:
-             df.columns = cols

          # Save the DataFrame to a CSV file
          filename = self._results[-1].filename
          df.to_csv(filename, index=False, quoting=csv.QUOTE_ALL)
          logger.info(f"Results saved to {filename}")

-     def execute(
+     async def run(
          self,
          search_term: str,
+         search_engines: List[SearchEngineName],
          language: Language,
          location: Location,
          deepness: Deepness,
-         prompts: List[Prompt],
          marketplaces: List[Host] | None = None,
          excluded_urls: List[Host] | None = None,
-         search_engines: List[SearchEngineName | str] | None = None,
          previously_collected_urls: List[str] | None = None,
      ) -> None:
-         """Runs the pipeline steps: serp, enrich, zyte, process, and collect the results.
+         """Runs the pipeline steps: srch, deduplication, context extraction, processing, and collect the results.

          Args:
              search_term: The search term for the query.
+             search_engines: The list of search engines to use for the search query.
              language: The language to use for the query.
              location: The location to use for the query.
              deepness: The search depth and enrichment details.
-             prompts: The list of prompts to use for classification.
-             marketplaces: The marketplaces to include in the search (optional).
-             excluded_urls: The URLs to exclude from the search (optional).
-             search_engines: The list of search engines to use for the search (optional).
-             previously_collected_urls: The urls that have been collected previously and are ignored (optional).
+             marketplaces: The marketplaces to include in the search.
+             excluded_urls: The URLs to exclude from the search.
+             previously_collected_urls: The urls that have been collected previously and are ignored.
          """
          # Handle results files
          timestamp = datetime.today().strftime("%Y%m%d%H%M%S")
-         filename = self._results_dir / self._filename_template.format(
+         filename = self._results_dir / self._FILENAME_TEMPLATE.format(
              search_term=search_term,
              language=language.code,
              location=location.code,
@@ -126,31 +135,16 @@ class FraudCrawlerClient(Orchestrator):
          )
          self._results.append(Results(search_term=search_term, filename=filename))

-         # Normalize inputs - convert strings to SearchEngineName enum values
-         nrm_search_engines = list(SearchEngineName)
-         if search_engines:
-             nrm_search_engines = [
-                 SearchEngineName(se) if isinstance(se, str) else se
-                 for se in search_engines
-             ]
- 
          # Run the pipeline by calling the orchestrator's run method
-         async def _run(*args, **kwargs):
-             async with self:
-                 return await super(FraudCrawlerClient, self).run(*args, **kwargs)
- 
-         asyncio.run(
-             _run(
-                 search_term=search_term,
-                 search_engines=nrm_search_engines,
-                 language=language,
-                 location=location,
-                 deepness=deepness,
-                 prompts=prompts,
-                 marketplaces=marketplaces,
-                 excluded_urls=excluded_urls,
-                 previously_collected_urls=previously_collected_urls,
-             )
+         await super().run(
+             search_term=search_term,
+             search_engines=search_engines,
+             language=language,
+             location=location,
+             deepness=deepness,
+             marketplaces=marketplaces,
+             excluded_urls=excluded_urls,
+             previously_collected_urls=previously_collected_urls,
          )

      def load_results(self, index: int = -1) -> pd.DataFrame:
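
Alongside the constructor change, the 0.5.x blocking `execute(...)` entry point is replaced by the awaitable `run(...)`, which now requires `search_engines` and no longer accepts `prompts` or bare engine-name strings. A hedged migration sketch, assuming `client`, `language`, `location`, and `deepness` are already-built instances as in the wiring example above (the search term is an illustrative value):

import asyncio

from fraudcrawler import SearchEngineName

async def crawl(client, language, location, deepness):
    # 0.5.x: client.execute(search_term=..., prompts=[...], ...) drove its own event loop.
    # 0.7.x: run() is a coroutine, so the caller awaits it inside an event loop.
    await client.run(
        search_term="insulin pen",
        search_engines=list(SearchEngineName),  # all engines; this was the old default
        language=language,
        location=location,
        deepness=deepness,
    )

# asyncio.run(crawl(client, language, location, deepness))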
@@ -161,7 +155,10 @@ class FraudCrawlerClient(Orchestrator):
          """

          results = self._results[index]
-         return pd.read_csv(results.filename)
+         if (filename := results.filename) is None:
+             raise ValueError("filename not found (is None)")
+ 
+         return pd.read_csv(filename)

      def print_available_results(self) -> None:
          """Prints the available results."""