fraudcrawler 0.5.8__py3-none-any.whl → 0.6.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of fraudcrawler might be problematic. Click here for more details.
- fraudcrawler/__init__.py +2 -2
- fraudcrawler/base/base.py +4 -38
- fraudcrawler/base/client.py +1 -1
- fraudcrawler/base/orchestrator.py +135 -135
- fraudcrawler/base/retry.py +12 -6
- fraudcrawler/launch_demo_pipeline.py +1 -1
- fraudcrawler/processing/processor.py +3 -3
- fraudcrawler/scraping/search.py +293 -74
- fraudcrawler/scraping/url.py +42 -3
- fraudcrawler/scraping/zyte.py +15 -1
- fraudcrawler/settings.py +13 -3
- {fraudcrawler-0.5.8.dist-info → fraudcrawler-0.6.0.dist-info}/METADATA +4 -3
- fraudcrawler-0.6.0.dist-info/RECORD +22 -0
- {fraudcrawler-0.5.8.dist-info → fraudcrawler-0.6.0.dist-info}/WHEEL +1 -1
- fraudcrawler-0.5.8.dist-info/RECORD +0 -22
- {fraudcrawler-0.5.8.dist-info → fraudcrawler-0.6.0.dist-info}/LICENSE +0 -0
- {fraudcrawler-0.5.8.dist-info → fraudcrawler-0.6.0.dist-info}/entry_points.txt +0 -0
fraudcrawler/settings.py
CHANGED
|
@@ -14,12 +14,22 @@ RETRY_EXP_BASE = 4
|
|
|
14
14
|
RETRY_JITTER = 1
|
|
15
15
|
RETRY_SKIP_IF_CODE = [400, 401, 403] # Skip retrying on these HTTP status codes
|
|
16
16
|
|
|
17
|
-
#
|
|
17
|
+
# Search settings
|
|
18
18
|
GOOGLE_LOCATIONS_FILENAME = ROOT_DIR / "fraudcrawler" / "base" / "google-locations.json"
|
|
19
19
|
GOOGLE_LANGUAGES_FILENAME = ROOT_DIR / "fraudcrawler" / "base" / "google-languages.json"
|
|
20
20
|
SEARCH_DEFAULT_COUNTRY_CODES: List[str] = [
|
|
21
21
|
# ".com",
|
|
22
22
|
]
|
|
23
|
+
TOPPREISE_SEARCH_PATHS = {
|
|
24
|
+
"de": "produktsuche",
|
|
25
|
+
"fr": "chercher",
|
|
26
|
+
"default": "browse",
|
|
27
|
+
}
|
|
28
|
+
TOPPREISE_COMPARISON_PATHS = [
|
|
29
|
+
"preisvergleich",
|
|
30
|
+
"comparison-prix",
|
|
31
|
+
"price-comparison",
|
|
32
|
+
]
|
|
23
33
|
|
|
24
34
|
# URL De-duplication settings
|
|
25
35
|
KNOWN_TRACKERS = [
|
|
@@ -76,8 +86,8 @@ PROCESSOR_USER_PROMPT_TEMPLATE = "Product Details:\n{product_details}\n\nRelevan
|
|
|
76
86
|
PROCESSOR_PRODUCT_DETAILS_TEMPLATE = "{field_name}:\n{field_value}"
|
|
77
87
|
|
|
78
88
|
# Async workers settings
|
|
79
|
-
|
|
80
|
-
|
|
89
|
+
DEFAULT_N_SRCH_WKRS = 5
|
|
90
|
+
DEFAULT_N_CNTX_WKRS = 15
|
|
81
91
|
DEFAULT_N_PROC_WKRS = 10
|
|
82
92
|
|
|
83
93
|
# HTTPX client settings
|
|
@@ -1,6 +1,6 @@
|
|
|
1
|
-
Metadata-Version: 2.
|
|
1
|
+
Metadata-Version: 2.3
|
|
2
2
|
Name: fraudcrawler
|
|
3
|
-
Version: 0.
|
|
3
|
+
Version: 0.6.0
|
|
4
4
|
Summary: Intelligent Market Monitoring
|
|
5
5
|
Home-page: https://github.com/open-veanu/fraudcrawler
|
|
6
6
|
License: MIT
|
|
@@ -11,6 +11,7 @@ Classifier: License :: OSI Approved :: MIT License
|
|
|
11
11
|
Classifier: Programming Language :: Python :: 3
|
|
12
12
|
Classifier: Programming Language :: Python :: 3.11
|
|
13
13
|
Classifier: Programming Language :: Python :: 3.12
|
|
14
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
14
15
|
Requires-Dist: beautifulsoup4 (>=4.13.4,<5.0.0)
|
|
15
16
|
Requires-Dist: httpx (>=0.28.1,<0.29.0)
|
|
16
17
|
Requires-Dist: openai (>=1.68.2,<2.0.0)
|
|
@@ -160,7 +161,7 @@ see `CONTRIBUTING.md`
|
|
|
160
161
|
### Async Setup
|
|
161
162
|
The `Orchestrator` class in `src/base/orchestrator.py` is designed to coordinate multiple services that may have interdependencies, allowing them to run in a semi-iterative manner. This means, for example, that product A can be at stage III of the pipeline while product B is still at stage I.
|
|
162
163
|
|
|
163
|
-
This behavior is enabled through an asynchronous pipeline setup. The three main steps, `
|
|
164
|
+
This behavior is enabled through an asynchronous pipeline setup. The three main steps, `Search`, `Context Extraction`, and `Processing`, all utilize `httpx.AsyncClient`. It is both possible and highly recommended to manage a single `AsyncClient` instance per application for efficiency. We provide a `HttpxAsyncClient` class that you can pass to these steps. For more details, see the [httpx documentation](https://www.python-httpx.org/api/#asyncclient).
|
|
164
165
|
|
|
165
166
|
The following image provides a schematic representation of the package's async setup.
|
|
166
167
|

|
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
fraudcrawler/__init__.py,sha256=oSwuiyVBBk_HZfeZxXJR0ELtA4mc-upsBMVHSwuokEo,846
|
|
2
|
+
fraudcrawler/base/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
3
|
+
fraudcrawler/base/base.py,sha256=IbkPookAAkqDCztzAvVRnhh8rCsYGlY69eI6cw-Kiw0,7294
|
|
4
|
+
fraudcrawler/base/client.py,sha256=obxrd65pYja--XQbgpIMsMO6erMNdRG68SzNUs_YvLM,5856
|
|
5
|
+
fraudcrawler/base/google-languages.json,sha256=z0VtjMCsCcZq11OkCIb9jMDD1p9Ty4lhV7bq4ddYvec,10748
|
|
6
|
+
fraudcrawler/base/google-locations.json,sha256=UtNu0iSStllvFRTQXMobWKmZR1hKmtgtHftLNgaJTT0,9204345
|
|
7
|
+
fraudcrawler/base/orchestrator.py,sha256=n0xrMJ9a3g3cRAMmhKEgyrwwrbgsaMno9DeyE93jB5U,27006
|
|
8
|
+
fraudcrawler/base/retry.py,sha256=1Ox7RsnnF62dP53rkidRHetA5mr2HS1R-7FskCVbwug,1178
|
|
9
|
+
fraudcrawler/launch_demo_pipeline.py,sha256=hTzGFQDEwchDSwUx0HgG_TW5h9J7BXM7jn_iB8iI838,4636
|
|
10
|
+
fraudcrawler/processing/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
11
|
+
fraudcrawler/processing/processor.py,sha256=zetp_G5g4z8sBUq-5qOxVRF2W2h9FIwolVxvMqhTmXs,7619
|
|
12
|
+
fraudcrawler/scraping/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
13
|
+
fraudcrawler/scraping/enrich.py,sha256=M4ErLF7q-5EKfEd-dIDS73mZc-aPFp5pJbgdRKCO3f8,13112
|
|
14
|
+
fraudcrawler/scraping/search.py,sha256=pMjTQEewa-jP6l2ndhHy8CNIcO4svhZOm6N_LNuv3gs,33925
|
|
15
|
+
fraudcrawler/scraping/url.py,sha256=unUoZ-bThU99ZlLdDUILdPx1kbtwMWPZVPCDqPscqHw,3217
|
|
16
|
+
fraudcrawler/scraping/zyte.py,sha256=SxucVH_wtVhPNImIXvijM528IwL6zl6I3ndf0OdVXY0,8860
|
|
17
|
+
fraudcrawler/settings.py,sha256=Bp9_9w_RRr_-PtZXcy30EKbT9YiOc8OLjEMaNZh06vc,3875
|
|
18
|
+
fraudcrawler-0.6.0.dist-info/LICENSE,sha256=B-3FuHfe3S0fWAlKlceskPcRhzXq81g-rJ-ddUYb4O8,1062
|
|
19
|
+
fraudcrawler-0.6.0.dist-info/METADATA,sha256=adpYLe_ToSth-YOZE3eh-KNUsNmcwcM_SE7pqKikNmU,6704
|
|
20
|
+
fraudcrawler-0.6.0.dist-info/WHEEL,sha256=RaoafKOydTQ7I_I3JTrPCg6kUmTgtm4BornzOqyEfJ8,88
|
|
21
|
+
fraudcrawler-0.6.0.dist-info/entry_points.txt,sha256=1Befm7cM6945y2AA1z9V4gZV63mtSWcAs7ypvgux_Xg,79
|
|
22
|
+
fraudcrawler-0.6.0.dist-info/RECORD,,
|
|
@@ -1,22 +0,0 @@
|
|
|
1
|
-
fraudcrawler/__init__.py,sha256=Kr19jWhtbC1shVoB9fHvBSeoG1IyQB9re1kCZ4YIAi0,842
|
|
2
|
-
fraudcrawler/base/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
3
|
-
fraudcrawler/base/base.py,sha256=NOJC12qw-iSkHScPnxFLfzUvg0w57qGaID6OAzHRXeo,8695
|
|
4
|
-
fraudcrawler/base/client.py,sha256=yhkNrhL2SuJXTknLf-8P81fv01FnFMahREZgem-Z-f0,5832
|
|
5
|
-
fraudcrawler/base/google-languages.json,sha256=z0VtjMCsCcZq11OkCIb9jMDD1p9Ty4lhV7bq4ddYvec,10748
|
|
6
|
-
fraudcrawler/base/google-locations.json,sha256=UtNu0iSStllvFRTQXMobWKmZR1hKmtgtHftLNgaJTT0,9204345
|
|
7
|
-
fraudcrawler/base/orchestrator.py,sha256=28X45XLPlJe2hvff8HTLo-V08LNeS0zMWBHe5W3hk4c,27039
|
|
8
|
-
fraudcrawler/base/retry.py,sha256=9VyVrbYR_0YnfxFhUrvcM3aWCYR6oR4iZE4A3zzVZUs,1015
|
|
9
|
-
fraudcrawler/launch_demo_pipeline.py,sha256=j5lu8lLl8QrkVU1MJH25uKtyYk_6lBSeoouCo30aRXg,4634
|
|
10
|
-
fraudcrawler/processing/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
11
|
-
fraudcrawler/processing/processor.py,sha256=Qq8QcTlqfnzFi1t-1KkncXxaIszUO7pGK3LXTdHkDnM,7638
|
|
12
|
-
fraudcrawler/scraping/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
13
|
-
fraudcrawler/scraping/enrich.py,sha256=M4ErLF7q-5EKfEd-dIDS73mZc-aPFp5pJbgdRKCO3f8,13112
|
|
14
|
-
fraudcrawler/scraping/search.py,sha256=ZjxOj95ih6o6bOWA0JnBwjFlMzGS-8Sb1P-yvHI5aO0,24957
|
|
15
|
-
fraudcrawler/scraping/url.py,sha256=5Z3hPW73E-TLhM-Zha8OTcUOumc_rcx64R0fT9z2Hi8,1748
|
|
16
|
-
fraudcrawler/scraping/zyte.py,sha256=GqvVWA1AWVoClAwd-hQ9iynsT0dOb7R0ntaLK5XVivM,8340
|
|
17
|
-
fraudcrawler/settings.py,sha256=uwXMOQpuwyWkuMU0asYGtBlL_qJj8F-Xkg4dUaCmDxE,3670
|
|
18
|
-
fraudcrawler-0.5.8.dist-info/LICENSE,sha256=B-3FuHfe3S0fWAlKlceskPcRhzXq81g-rJ-ddUYb4O8,1062
|
|
19
|
-
fraudcrawler-0.5.8.dist-info/METADATA,sha256=-e9xqpIk0EjO6fqwhmQZ5gsDrl6eJKU7VQdp8MeN0R4,6642
|
|
20
|
-
fraudcrawler-0.5.8.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
|
|
21
|
-
fraudcrawler-0.5.8.dist-info/entry_points.txt,sha256=1Befm7cM6945y2AA1z9V4gZV63mtSWcAs7ypvgux_Xg,79
|
|
22
|
-
fraudcrawler-0.5.8.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|