datamarket 0.7.24.1__tar.gz → 0.7.112__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of datamarket might be problematic. Click here for more details.
- datamarket-0.7.112/PKG-INFO +168 -0
- datamarket-0.7.112/pyproject.toml +155 -0
- datamarket-0.7.112/src/datamarket/exceptions/__init__.py +1 -0
- datamarket-0.7.112/src/datamarket/exceptions/main.py +53 -0
- datamarket-0.7.112/src/datamarket/interfaces/alchemy.py +374 -0
- datamarket-0.7.112/src/datamarket/interfaces/aws.py +127 -0
- datamarket-0.7.112/src/datamarket/interfaces/azure.py +135 -0
- {datamarket-0.7.24.1 → datamarket-0.7.112}/src/datamarket/interfaces/drive.py +60 -10
- datamarket-0.7.112/src/datamarket/interfaces/ftp.py +68 -0
- datamarket-0.7.112/src/datamarket/interfaces/nominatim.py +359 -0
- {datamarket-0.7.24.1 → datamarket-0.7.112}/src/datamarket/interfaces/peerdb.py +70 -23
- datamarket-0.7.112/src/datamarket/interfaces/proxy.py +366 -0
- {datamarket-0.7.24.1 → datamarket-0.7.112}/src/datamarket/interfaces/tinybird.py +4 -12
- datamarket-0.7.112/src/datamarket/params/__init__.py +0 -0
- datamarket-0.7.112/src/datamarket/params/nominatim.py +424 -0
- datamarket-0.7.112/src/datamarket/utils/__init__.py +1 -0
- {datamarket-0.7.24.1 → datamarket-0.7.112}/src/datamarket/utils/airflow.py +10 -7
- {datamarket-0.7.24.1 → datamarket-0.7.112}/src/datamarket/utils/alchemy.py +2 -1
- datamarket-0.7.112/src/datamarket/utils/main.py +222 -0
- datamarket-0.7.112/src/datamarket/utils/nominatim.py +201 -0
- datamarket-0.7.112/src/datamarket/utils/playwright/__init__.py +0 -0
- datamarket-0.7.112/src/datamarket/utils/playwright/async_api.py +235 -0
- datamarket-0.7.112/src/datamarket/utils/playwright/sync_api.py +244 -0
- datamarket-0.7.112/src/datamarket/utils/requests.py +165 -0
- {datamarket-0.7.24.1 → datamarket-0.7.112}/src/datamarket/utils/selenium.py +6 -12
- datamarket-0.7.112/src/datamarket/utils/strings/__init__.py +1 -0
- datamarket-0.7.112/src/datamarket/utils/strings/normalization.py +217 -0
- datamarket-0.7.112/src/datamarket/utils/strings/obfuscation.py +153 -0
- datamarket-0.7.112/src/datamarket/utils/strings/standardization.py +40 -0
- {datamarket-0.7.24.1 → datamarket-0.7.112}/src/datamarket/utils/typer.py +2 -1
- datamarket-0.7.112/src/datamarket/utils/types.py +1 -0
- datamarket-0.7.24.1/PKG-INFO +0 -148
- datamarket-0.7.24.1/pyproject.toml +0 -129
- datamarket-0.7.24.1/src/datamarket/__init__.py +0 -1
- datamarket-0.7.24.1/src/datamarket/interfaces/alchemy.py +0 -111
- datamarket-0.7.24.1/src/datamarket/interfaces/aws.py +0 -60
- datamarket-0.7.24.1/src/datamarket/interfaces/ftp.py +0 -61
- datamarket-0.7.24.1/src/datamarket/interfaces/nominatim.py +0 -110
- datamarket-0.7.24.1/src/datamarket/interfaces/proxy.py +0 -93
- datamarket-0.7.24.1/src/datamarket/params/nominatim.py +0 -54
- datamarket-0.7.24.1/src/datamarket/utils/__init__.py +0 -1
- datamarket-0.7.24.1/src/datamarket/utils/main.py +0 -101
- {datamarket-0.7.24.1 → datamarket-0.7.112}/LICENSE +0 -0
- {datamarket-0.7.24.1 → datamarket-0.7.112}/README.md +0 -0
- {datamarket-0.7.24.1/src/datamarket/interfaces → datamarket-0.7.112/src/datamarket}/__init__.py +0 -0
- {datamarket-0.7.24.1/src/datamarket/params → datamarket-0.7.112/src/datamarket/interfaces}/__init__.py +0 -0
- {datamarket-0.7.24.1 → datamarket-0.7.112}/src/datamarket/utils/soda.py +0 -0
|
@@ -0,0 +1,168 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: datamarket
|
|
3
|
+
Version: 0.7.112
|
|
4
|
+
Summary: Utilities that integrate advanced scraping knowledge into just one library.
|
|
5
|
+
License: GPL-3.0-or-later
|
|
6
|
+
License-File: LICENSE
|
|
7
|
+
Author: DataMarket
|
|
8
|
+
Author-email: techsupport@datamarket.es
|
|
9
|
+
Requires-Python: >=3.12,<4.0
|
|
10
|
+
Classifier: License :: OSI Approved :: GNU General Public License v3 (GPLv3)
|
|
11
|
+
Classifier: License :: OSI Approved :: GNU General Public License v3 or later (GPLv3+)
|
|
12
|
+
Classifier: Operating System :: OS Independent
|
|
13
|
+
Classifier: Programming Language :: Python :: 3
|
|
14
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.14
|
|
17
|
+
Provides-Extra: aws
|
|
18
|
+
Provides-Extra: azure-storage-blob
|
|
19
|
+
Provides-Extra: boto3
|
|
20
|
+
Provides-Extra: camoufox
|
|
21
|
+
Provides-Extra: chompjs
|
|
22
|
+
Provides-Extra: click
|
|
23
|
+
Provides-Extra: clickhouse-driver
|
|
24
|
+
Provides-Extra: datetime
|
|
25
|
+
Provides-Extra: ddgs
|
|
26
|
+
Provides-Extra: demjson3
|
|
27
|
+
Provides-Extra: dnspython
|
|
28
|
+
Provides-Extra: drive
|
|
29
|
+
Provides-Extra: fake-useragent
|
|
30
|
+
Provides-Extra: geoalchemy2
|
|
31
|
+
Provides-Extra: geopandas
|
|
32
|
+
Provides-Extra: google-api-python-client
|
|
33
|
+
Provides-Extra: google-auth-httplib2
|
|
34
|
+
Provides-Extra: google-auth-oauthlib
|
|
35
|
+
Provides-Extra: html2text
|
|
36
|
+
Provides-Extra: httpx
|
|
37
|
+
Provides-Extra: json5
|
|
38
|
+
Provides-Extra: lxml
|
|
39
|
+
Provides-Extra: matplotlib
|
|
40
|
+
Provides-Extra: nodriver
|
|
41
|
+
Provides-Extra: openpyxl
|
|
42
|
+
Provides-Extra: pandarallel
|
|
43
|
+
Provides-Extra: pandas
|
|
44
|
+
Provides-Extra: pandera
|
|
45
|
+
Provides-Extra: peerdb
|
|
46
|
+
Provides-Extra: pii
|
|
47
|
+
Provides-Extra: pillow
|
|
48
|
+
Provides-Extra: playwright
|
|
49
|
+
Provides-Extra: playwright-stealth
|
|
50
|
+
Provides-Extra: plotly
|
|
51
|
+
Provides-Extra: pyarrow
|
|
52
|
+
Provides-Extra: pydrive2
|
|
53
|
+
Provides-Extra: pymupdf
|
|
54
|
+
Provides-Extra: pyproj
|
|
55
|
+
Provides-Extra: pyrate-limiter
|
|
56
|
+
Provides-Extra: pysocks
|
|
57
|
+
Provides-Extra: pyspark
|
|
58
|
+
Provides-Extra: pytest
|
|
59
|
+
Provides-Extra: retry
|
|
60
|
+
Provides-Extra: rnet
|
|
61
|
+
Provides-Extra: shapely
|
|
62
|
+
Provides-Extra: soda-core-mysql
|
|
63
|
+
Provides-Extra: soda-core-postgres
|
|
64
|
+
Provides-Extra: sqlparse
|
|
65
|
+
Provides-Extra: tqdm
|
|
66
|
+
Provides-Extra: undetected-chromedriver
|
|
67
|
+
Provides-Extra: xmltodict
|
|
68
|
+
Requires-Dist: SQLAlchemy (>=2.0.0,<3.0.0)
|
|
69
|
+
Requires-Dist: azure-storage-blob (>=12.0.0,<13.0.0) ; extra == "azure-storage-blob"
|
|
70
|
+
Requires-Dist: babel (>=2.0.0,<3.0.0)
|
|
71
|
+
Requires-Dist: beautifulsoup4 (>=4.0.0,<5.0.0)
|
|
72
|
+
Requires-Dist: boto3 (>=1.35.0,<1.36.0) ; extra == "boto3" or extra == "aws" or extra == "peerdb"
|
|
73
|
+
Requires-Dist: browserforge (>=1.2.0,<2.0.0) ; extra == "camoufox"
|
|
74
|
+
Requires-Dist: camoufox[geoip] (>=0.4.11,<0.5.0) ; extra == "camoufox"
|
|
75
|
+
Requires-Dist: chompjs (>=1.0.0,<2.0.0) ; extra == "chompjs"
|
|
76
|
+
Requires-Dist: click (>=8.0.0,<9.0.0) ; extra == "click"
|
|
77
|
+
Requires-Dist: clickhouse-driver (>=0.2.0,<0.3.0) ; extra == "clickhouse-driver" or extra == "peerdb"
|
|
78
|
+
Requires-Dist: croniter (>=3.0.0,<4.0.0)
|
|
79
|
+
Requires-Dist: datetime (>=5.0,<6.0) ; extra == "datetime"
|
|
80
|
+
Requires-Dist: ddgs (>=9.0.0,<10.0.0) ; extra == "ddgs"
|
|
81
|
+
Requires-Dist: demjson3 (>=3.0.0,<4.0.0) ; extra == "demjson3"
|
|
82
|
+
Requires-Dist: dnspython (>=2.0.0,<3.0.0) ; extra == "dnspython"
|
|
83
|
+
Requires-Dist: dynaconf (>=3.0.0,<4.0.0)
|
|
84
|
+
Requires-Dist: fake-useragent (>=2.0.0,<3.0.0) ; extra == "fake-useragent"
|
|
85
|
+
Requires-Dist: geoalchemy2 (>=0.17.0,<0.18.0) ; extra == "geoalchemy2"
|
|
86
|
+
Requires-Dist: geopandas (>=1.0.0,<2.0.0) ; extra == "geopandas"
|
|
87
|
+
Requires-Dist: geopy (>=2.0.0,<3.0.0)
|
|
88
|
+
Requires-Dist: google-api-python-client (>=2.0.0,<3.0.0) ; extra == "google-api-python-client"
|
|
89
|
+
Requires-Dist: google-auth-httplib2 (>=0.2.0,<0.3.0) ; extra == "google-auth-httplib2"
|
|
90
|
+
Requires-Dist: google-auth-oauthlib (>=1.0.0,<2.0.0) ; extra == "google-auth-oauthlib"
|
|
91
|
+
Requires-Dist: html2text (>=2024.0.0,<2025.0.0) ; extra == "html2text"
|
|
92
|
+
Requires-Dist: httpx[http2] (>=0.28.0,<0.29.0) ; extra == "httpx"
|
|
93
|
+
Requires-Dist: inflection (>=0.5.0,<0.6.0)
|
|
94
|
+
Requires-Dist: jellyfish (>=1.0.0,<2.0.0)
|
|
95
|
+
Requires-Dist: jinja2 (>=3.0.0,<4.0.0)
|
|
96
|
+
Requires-Dist: json5 (>=0.10.0,<0.11.0) ; extra == "json5"
|
|
97
|
+
Requires-Dist: lxml[html-clean] (>=5.0.0,<6.0.0) ; extra == "lxml"
|
|
98
|
+
Requires-Dist: matplotlib (>=3.0.0,<4.0.0) ; extra == "matplotlib"
|
|
99
|
+
Requires-Dist: nodriver (>=0.44,<0.45) ; extra == "nodriver"
|
|
100
|
+
Requires-Dist: numpy (>=2.0.0,<3.0.0)
|
|
101
|
+
Requires-Dist: openpyxl (>=3.0.0,<4.0.0) ; extra == "openpyxl"
|
|
102
|
+
Requires-Dist: pandarallel (>=1.0.0,<2.0.0) ; extra == "pandarallel"
|
|
103
|
+
Requires-Dist: pandas (>=2.0.0,<3.0.0) ; extra == "pandas"
|
|
104
|
+
Requires-Dist: pandera (>=0.22.0,<0.23.0) ; extra == "pandera"
|
|
105
|
+
Requires-Dist: pendulum (>=3.0.0,<4.0.0)
|
|
106
|
+
Requires-Dist: pillow (>=11.0.0,<12.0.0) ; extra == "pillow"
|
|
107
|
+
Requires-Dist: playwright (==1.47.0) ; extra == "playwright" or extra == "camoufox"
|
|
108
|
+
Requires-Dist: plotly (>=6.0.0,<7.0.0) ; extra == "plotly"
|
|
109
|
+
Requires-Dist: pre-commit (>=4.0.0,<5.0.0)
|
|
110
|
+
Requires-Dist: presidio-analyzer[phonenumbers] (>=2.0.0,<3.0.0) ; extra == "pii"
|
|
111
|
+
Requires-Dist: presidio-anonymizer (>=2.0.0,<3.0.0) ; extra == "pii"
|
|
112
|
+
Requires-Dist: psycopg2-binary (>=2.0.0,<3.0.0)
|
|
113
|
+
Requires-Dist: pyarrow (>=19.0.0,<20.0.0) ; extra == "pyarrow"
|
|
114
|
+
Requires-Dist: pycountry (>=24.0.0,<25.0.0)
|
|
115
|
+
Requires-Dist: pydrive2 (>=1.0.0,<2.0.0) ; extra == "pydrive2" or extra == "drive"
|
|
116
|
+
Requires-Dist: pymupdf (>=1.0.0,<2.0.0) ; extra == "pymupdf"
|
|
117
|
+
Requires-Dist: pyproj (>=3.0.0,<4.0.0) ; extra == "pyproj"
|
|
118
|
+
Requires-Dist: pyrate-limiter (>=3.0.0,<4.0.0) ; extra == "pyrate-limiter"
|
|
119
|
+
Requires-Dist: pysocks (>=1.0.0,<2.0.0) ; extra == "pysocks"
|
|
120
|
+
Requires-Dist: pyspark (>=3.0.0,<4.0.0) ; extra == "pyspark"
|
|
121
|
+
Requires-Dist: pytest (>=8.0.0,<9.0.0) ; extra == "pytest"
|
|
122
|
+
Requires-Dist: python-string-utils (>=1.0.0,<2.0.0)
|
|
123
|
+
Requires-Dist: rapidfuzz (>=3.0.0,<4.0.0)
|
|
124
|
+
Requires-Dist: requests (>=2.0.0,<3.0.0)
|
|
125
|
+
Requires-Dist: retry (>=0.9.0,<0.10.0) ; extra == "retry"
|
|
126
|
+
Requires-Dist: rnet (>=2.0.0,<3.0.0) ; extra == "rnet"
|
|
127
|
+
Requires-Dist: shapely (>=2.0.0,<3.0.0) ; extra == "shapely"
|
|
128
|
+
Requires-Dist: soda-core-mysql-utf8-hotfix (>=3.0.0,<4.0.0) ; extra == "soda-core-mysql"
|
|
129
|
+
Requires-Dist: soda-core-postgres (>=3.0.0,<4.0.0) ; extra == "soda-core-postgres"
|
|
130
|
+
Requires-Dist: spacy (>=3.0.0,<4.0.0) ; extra == "pii"
|
|
131
|
+
Requires-Dist: spacy-langdetect (>=0.1.0,<0.2.0) ; extra == "pii"
|
|
132
|
+
Requires-Dist: sqlparse (>=0.5.0,<0.6.0) ; extra == "sqlparse"
|
|
133
|
+
Requires-Dist: stem (>=1.0.0,<2.0.0)
|
|
134
|
+
Requires-Dist: tenacity (>=9.0.0,<10.0.0)
|
|
135
|
+
Requires-Dist: tqdm (>=4.0.0,<5.0.0) ; extra == "tqdm"
|
|
136
|
+
Requires-Dist: typer (>=0.15.0,<0.16.0)
|
|
137
|
+
Requires-Dist: unidecode (>=1.0.0,<2.0.0)
|
|
138
|
+
Requires-Dist: xmltodict (>=0.14.0,<0.15.0) ; extra == "xmltodict"
|
|
139
|
+
Project-URL: Documentation, https://github.com/Data-Market/datamarket
|
|
140
|
+
Project-URL: Homepage, https://datamarket.es
|
|
141
|
+
Project-URL: Repository, https://github.com/Data-Market/datamarket
|
|
142
|
+
Description-Content-Type: text/markdown
|
|
143
|
+
|
|
144
|
+
# DataMarket scraping core
|
|
145
|
+
|
|
146
|
+
------------------------------------------------------
|
|
147
|
+
[![Code style: black](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/psf/black)
|
|
148
|
+
|
|
149
|
+
|
|
150
|
+
Utilities that integrate advanced scraping knowledge into just one library.
|
|
151
|
+
|
|
152
|
+
## Installation
|
|
153
|
+
|
|
154
|
+
To install this library in your Python environment:
|
|
155
|
+
|
|
156
|
+
`pip install datamarket`
|
|
157
|
+
|
|
158
|
+
## Documentation
|
|
159
|
+
|
|
160
|
+
This library provides built-in functionality for the following topics:
|
|
161
|
+
|
|
162
|
+
- **Databases**: through SQLAlchemy, it lets you insert records and perform queries in any database.
|
|
163
|
+
- **Proxies**: wide range of functions to perform HTTP requests through custom proxies or the Tor network.
|
|
164
|
+
- **Tinybird**: a Python client for this popular API.
|
|
165
|
+
- **Drive**: functions to authenticate with Google Drive and to upload or delete files.
|
|
166
|
+
- **FTP**: functions to authenticate with an FTP, SFTP or FTPS server and to upload or delete files.
|
|
167
|
+
- **Selenium**: wrapper for the main Selenium functions.
|
|
168
|
+
|
|
@@ -0,0 +1,155 @@
|
|
|
1
|
+
[tool.poetry]
|
|
2
|
+
name = "datamarket"
|
|
3
|
+
version = "0.7.112"
|
|
4
|
+
description = "Utilities that integrate advanced scraping knowledge into just one library."
|
|
5
|
+
authors = ["DataMarket <techsupport@datamarket.es>"]
|
|
6
|
+
license = "GPL-3.0-or-later"
|
|
7
|
+
readme = "README.md"
|
|
8
|
+
homepage = "https://datamarket.es"
|
|
9
|
+
repository = "https://github.com/Data-Market/datamarket"
|
|
10
|
+
documentation = "https://github.com/Data-Market/datamarket"
|
|
11
|
+
classifiers = [
|
|
12
|
+
"Programming Language :: Python :: 3",
|
|
13
|
+
"License :: OSI Approved :: GNU General Public License v3 (GPLv3)",
|
|
14
|
+
"Operating System :: OS Independent",
|
|
15
|
+
]
|
|
16
|
+
|
|
17
|
+
[tool.poetry.dependencies]
|
|
18
|
+
python = "^3.12"
|
|
19
|
+
typer = "~0.15.0"
|
|
20
|
+
SQLAlchemy = "^2.0.0"
|
|
21
|
+
psycopg2-binary = "^2.0.0"
|
|
22
|
+
requests = "^2.0.0"
|
|
23
|
+
tenacity = "^9.0.0"
|
|
24
|
+
beautifulsoup4 = "^4.0.0"
|
|
25
|
+
pre-commit = "^4.0.0"
|
|
26
|
+
pendulum = "^3.0.0"
|
|
27
|
+
croniter = "^3.0.0"
|
|
28
|
+
dynaconf = "^3.0.0"
|
|
29
|
+
jinja2 = "^3.0.0"
|
|
30
|
+
inflection = "~0.5.0"
|
|
31
|
+
python-string-utils = "^1.0.0"
|
|
32
|
+
unidecode = "^1.0.0"
|
|
33
|
+
numpy = "^2.0.0"
|
|
34
|
+
pycountry = "^24.0.0"
|
|
35
|
+
geopy = "^2.0.0"
|
|
36
|
+
jellyfish = "^1.0.0"
|
|
37
|
+
stem = "^1.0.0"
|
|
38
|
+
babel = "^2.0.0"
|
|
39
|
+
rapidfuzz = "^3.0.0"
|
|
40
|
+
|
|
41
|
+
boto3 = { version = "~1.35.0", optional = true }
|
|
42
|
+
lxml = { extras = ["html-clean"], version = "^5.0.0", optional = true }
|
|
43
|
+
tqdm = { version = "^4.0.0", optional = true }
|
|
44
|
+
pandas = { version = "^2.0.0", optional = true }
|
|
45
|
+
pyarrow = { version = "^19.0.0", optional = true }
|
|
46
|
+
pytest = { version = "^8.0.0", optional = true }
|
|
47
|
+
playwright = { version = "1.47.0", optional = true }
|
|
48
|
+
tf-playwright-stealth = { version = "^1.0.0", optional = true }
|
|
49
|
+
soda-core-postgres = { version = "^3.0.0", optional = true }
|
|
50
|
+
soda-core-mysql-utf8-hotfix = { version = "^3.0.0", optional = true }
|
|
51
|
+
fake-useragent = { version = "^2.0.0", optional = true }
|
|
52
|
+
pydrive2 = { version = "^1.0.0", optional = true }
|
|
53
|
+
clickhouse-driver = { version = "~0.2.0", optional = true }
|
|
54
|
+
click = { version = "^8.0.0", optional = true }
|
|
55
|
+
demjson3 = { version = "^3.0.0", optional = true }
|
|
56
|
+
nodriver = { version = "~0.44", optional = true }
|
|
57
|
+
retry = { version = "~0.9.0", optional = true }
|
|
58
|
+
shapely = { version = "^2.0.0", optional = true }
|
|
59
|
+
geopandas = { version = "^1.0.0", optional = true }
|
|
60
|
+
chompjs = { version = "^1.0.0", optional = true }
|
|
61
|
+
pillow = { version = "^11.0.0", optional = true }
|
|
62
|
+
ddgs = { version = "^9.0.0", optional = true }
|
|
63
|
+
pysocks = { version = "^1.0.0", optional = true }
|
|
64
|
+
xmltodict = { version = "~0.14.0", optional = true }
|
|
65
|
+
pymupdf = { version = "^1.0.0", optional = true }
|
|
66
|
+
html2text = { version = "^2024.0.0", optional = true }
|
|
67
|
+
pyspark = { version = "^3.0.0", optional = true }
|
|
68
|
+
pandera = { version = "~0.22.0", optional = true }
|
|
69
|
+
json5 = { version = "~0.10.0", optional = true }
|
|
70
|
+
geoalchemy2 = { version = "~0.17.0", optional = true }
|
|
71
|
+
datetime = { version = "^5.0", optional = true }
|
|
72
|
+
azure-storage-blob = { version = "^12.0.0", optional = true }
|
|
73
|
+
google-api-python-client = { version = "^2.0.0", optional = true }
|
|
74
|
+
google-auth-httplib2 = { version = "~0.2.0", optional = true }
|
|
75
|
+
google-auth-oauthlib = { version = "^1.0.0", optional = true }
|
|
76
|
+
dnspython = { version = "^2.0.0", optional = true }
|
|
77
|
+
openpyxl = { version = "^3.0.0", optional = true }
|
|
78
|
+
httpx = { extras = ["http2"], version = "~0.28.0", optional = true }
|
|
79
|
+
camoufox = { extras = ["geoip"], version = "~0.4.11", optional = true }
|
|
80
|
+
browserforge = { version = "^1.2.0", optional = true }
|
|
81
|
+
presidio-analyzer = { version = "^2.0.0", optional = true, extras = [
|
|
82
|
+
"phonenumbers",
|
|
83
|
+
] }
|
|
84
|
+
presidio-anonymizer = { version = "^2.0.0", optional = true }
|
|
85
|
+
spacy = { version = "^3.0.0", optional = true }
|
|
86
|
+
spacy-langdetect = { version = "~0.1.0", optional = true }
|
|
87
|
+
pandarallel = { version = "^1.0.0", optional = true }
|
|
88
|
+
pyrate-limiter = { version = "^3.0.0", optional = true }
|
|
89
|
+
pyproj = { version = "^3.0.0", optional = true }
|
|
90
|
+
sqlparse = { version = "~0.5.0", optional = true }
|
|
91
|
+
rnet = { version = "^2.0.0", optional = true }
|
|
92
|
+
matplotlib = { version = "^3.0.0", optional = true }
|
|
93
|
+
plotly = { version = "^6.0.0", optional = true }
|
|
94
|
+
|
|
95
|
+
[tool.poetry.extras]
|
|
96
|
+
boto3 = ["boto3"]
|
|
97
|
+
lxml = ["lxml"]
|
|
98
|
+
tqdm = ["tqdm"]
|
|
99
|
+
pandas = ["pandas"]
|
|
100
|
+
pyarrow = ["pyarrow"]
|
|
101
|
+
pytest = ["pytest"]
|
|
102
|
+
playwright = ["playwright"]
|
|
103
|
+
playwright-stealth = ["tf-playwright-stealth"]  # must match the optional dependency key declared under [tool.poetry.dependencies]
|
|
104
|
+
soda-core-postgres = ["soda-core-postgres"]
|
|
105
|
+
soda-core-mysql = ["soda-core-mysql-utf8-hotfix"]
|
|
106
|
+
fake-useragent = ["fake-useragent"]
|
|
107
|
+
pydrive2 = ["pydrive2"]
|
|
108
|
+
clickhouse-driver = ["clickhouse-driver"]
|
|
109
|
+
click = ["click"]
|
|
110
|
+
demjson3 = ["demjson3"]
|
|
111
|
+
nodriver = ["nodriver"]
|
|
112
|
+
undetected-chromedriver = ["undetected-chromedriver"]
|
|
113
|
+
retry = ["retry"]
|
|
114
|
+
shapely = ["shapely"]
|
|
115
|
+
geopandas = ["geopandas"]
|
|
116
|
+
chompjs = ["chompjs"]
|
|
117
|
+
pillow = ["pillow"]
|
|
118
|
+
ddgs = ["ddgs"]
|
|
119
|
+
pysocks = ["pysocks"]
|
|
120
|
+
xmltodict = ["xmltodict"]
|
|
121
|
+
pymupdf = ["pymupdf"]
|
|
122
|
+
html2text = ["html2text"]
|
|
123
|
+
pyspark = ["pyspark"]
|
|
124
|
+
pandera = ["pandera"]
|
|
125
|
+
json5 = ["json5"]
|
|
126
|
+
geoalchemy2 = ["geoalchemy2"]
|
|
127
|
+
datetime = ["datetime"]
|
|
128
|
+
azure-storage-blob = ["azure-storage-blob"]
|
|
129
|
+
google-api-python-client = ["google-api-python-client"]
|
|
130
|
+
google-auth-httplib2 = ["google-auth-httplib2"]
|
|
131
|
+
google-auth-oauthlib = ["google-auth-oauthlib"]
|
|
132
|
+
dnspython = ["dnspython"]
|
|
133
|
+
openpyxl = ["openpyxl"]
|
|
134
|
+
httpx = ["httpx"]
|
|
135
|
+
camoufox = ["camoufox", "browserforge", "playwright"]
|
|
136
|
+
pandarallel = ["pandarallel"]
|
|
137
|
+
pyrate-limiter = ["pyrate-limiter"]
|
|
138
|
+
pyproj = ["pyproj"]
|
|
139
|
+
sqlparse = ["sqlparse"]
|
|
140
|
+
rnet = ["rnet"]
|
|
141
|
+
matplotlib = ["matplotlib"]
|
|
142
|
+
plotly = ["plotly"]
|
|
143
|
+
|
|
144
|
+
# Interface groups
|
|
145
|
+
aws = ["boto3"]
|
|
146
|
+
drive = ["pydrive2"]
|
|
147
|
+
peerdb = ["boto3", "clickhouse-driver"]
|
|
148
|
+
|
|
149
|
+
# Other groups
|
|
150
|
+
pii = ["presidio-analyzer", "presidio-anonymizer", "spacy", "spacy-langdetect"]
|
|
151
|
+
|
|
152
|
+
|
|
153
|
+
[build-system]
|
|
154
|
+
requires = ["poetry-core>=1.0.0"]
|
|
155
|
+
build-backend = "poetry.core.masonry.api"
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
from .main import * # noqa: F403
|
|
@@ -0,0 +1,53 @@
|
|
|
1
|
+
########################################################################################################################
|
|
2
|
+
# CLASSES
|
|
3
|
+
|
|
4
|
+
|
|
5
|
+
import requests
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
class RedirectionDetectedError(Exception):
|
|
9
|
+
def __init__(self, message="Redirection detected!"):
|
|
10
|
+
self.message = message
|
|
11
|
+
super().__init__(self.message)
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
class NotFoundError(Exception):
|
|
15
|
+
def __init__(self, message="Not found!"):
|
|
16
|
+
self.message = message
|
|
17
|
+
super().__init__(self.message)
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
class BadRequestError(Exception):
|
|
21
|
+
def __init__(self, message="Bad request!"):
|
|
22
|
+
self.message = message
|
|
23
|
+
super().__init__(self.message)
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
class EmptyResponseError(Exception):
|
|
27
|
+
def __init__(self, message="Empty response!"):
|
|
28
|
+
self.message = message
|
|
29
|
+
super().__init__(self.message)
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
class ManagedHTTPError(Exception):
|
|
33
|
+
"""Signal that this HTTP status was handled and should not be retried."""
|
|
34
|
+
|
|
35
|
+
def __init__(self, response: requests.Response, *, url: str | None = None, message: str | None = None):
|
|
36
|
+
self.response = response
|
|
37
|
+
self.request = getattr(response, "request", None)
|
|
38
|
+
self.status_code = getattr(response, "status_code", None)
|
|
39
|
+
self.url = url or (self.request.url if self.request is not None else None)
|
|
40
|
+
self.message = message
|
|
41
|
+
super().__init__(message or f"HTTP {self.status_code} for {self.url}")
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
class NoWorkingProxiesError(Exception):
|
|
45
|
+
def __init__(self, message="No working proxies available"):
|
|
46
|
+
self.message = message
|
|
47
|
+
super().__init__(self.message)
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
class EnsureNewIPTimeoutError(Exception):
|
|
51
|
+
def __init__(self, message="Timed out waiting for new IP"):
|
|
52
|
+
self.message = message
|
|
53
|
+
super().__init__(self.message)
|