datamarket 0.9.3__py3-none-any.whl → 0.9.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of datamarket might be problematic. Click here for more details.
- datamarket/interfaces/peerdb.py +8 -34
- datamarket-0.9.4.dist-info/METADATA +144 -0
- {datamarket-0.9.3.dist-info → datamarket-0.9.4.dist-info}/RECORD +5 -5
- datamarket-0.9.3.dist-info/METADATA +0 -149
- {datamarket-0.9.3.dist-info → datamarket-0.9.4.dist-info}/LICENSE +0 -0
- {datamarket-0.9.3.dist-info → datamarket-0.9.4.dist-info}/WHEEL +0 -0
datamarket/interfaces/peerdb.py
CHANGED
|
@@ -216,52 +216,26 @@ class TransientS3:
|
|
|
216
216
|
self.config = section
|
|
217
217
|
self.bucket_name = self.config["bucket"]
|
|
218
218
|
self.session = boto3.Session(profile_name=self.config["profile"])
|
|
219
|
-
self.
|
|
219
|
+
self.s3_resource = self.session.resource("s3")
|
|
220
220
|
self.credentials = self.session.get_credentials()
|
|
221
221
|
self.access_key = self.credentials.access_key
|
|
222
222
|
self.secret_key = self.credentials.secret_key
|
|
223
223
|
self.region_name = self.session.region_name
|
|
224
|
-
self.endpoint_url = self.
|
|
224
|
+
self.endpoint_url = self.s3_resource.meta.endpoint_url
|
|
225
225
|
else:
|
|
226
226
|
logger.warning("no peerdb.s3 section in config")
|
|
227
227
|
|
|
228
228
|
def delete_paths_with_schema(self, schema_name):
|
|
229
229
|
logger.info(f"Deleting paths containing '{schema_name}' from S3")
|
|
230
230
|
|
|
231
|
-
|
|
232
|
-
pages = paginator.paginate(Bucket=self.bucket_name, Delimiter="/")
|
|
231
|
+
bucket = self.s3_resource.Bucket(self.bucket_name)
|
|
233
232
|
|
|
234
|
-
for
|
|
235
|
-
|
|
236
|
-
|
|
237
|
-
folder = prefix["Prefix"]
|
|
238
|
-
if schema_name in folder:
|
|
239
|
-
self._delete_folder_contents(folder)
|
|
233
|
+
for prefix in [schema_name, f"clone_{schema_name}"]:
|
|
234
|
+
objects_to_delete = bucket.objects.filter(Prefix=prefix)
|
|
235
|
+
objects_to_delete.delete()
|
|
240
236
|
|
|
241
237
|
logger.info(f"Deleted paths containing '{schema_name}' from S3")
|
|
242
238
|
|
|
243
|
-
def _delete_folder_contents(self, folder):
|
|
244
|
-
logger.info(f"Deleting contents of folder: {folder}")
|
|
245
|
-
|
|
246
|
-
paginator = self.s3_client.get_paginator("list_objects_v2")
|
|
247
|
-
pages = paginator.paginate(Bucket=self.bucket_name, Prefix=folder)
|
|
248
|
-
|
|
249
|
-
delete_us = dict(Objects=[])
|
|
250
|
-
for page in pages:
|
|
251
|
-
if "Contents" in page:
|
|
252
|
-
for obj in page["Contents"]:
|
|
253
|
-
delete_us["Objects"].append(dict(Key=obj["Key"]))
|
|
254
|
-
|
|
255
|
-
# AWS limits to deleting 1000 objects at a time
|
|
256
|
-
if len(delete_us["Objects"]) >= 1000:
|
|
257
|
-
self.s3_client.delete_objects(Bucket=self.bucket_name, Delete=delete_us)
|
|
258
|
-
delete_us = dict(Objects=[])
|
|
259
|
-
|
|
260
|
-
if len(delete_us["Objects"]):
|
|
261
|
-
self.s3_client.delete_objects(Bucket=self.bucket_name, Delete=delete_us)
|
|
262
|
-
|
|
263
|
-
logger.info(f"Deleted contents of folder: {folder}")
|
|
264
|
-
|
|
265
239
|
|
|
266
240
|
class PeerDBInterface:
|
|
267
241
|
def __init__(self, config):
|
|
@@ -308,11 +282,11 @@ class PeerDBInterface:
|
|
|
308
282
|
if not self.docker_host_mapping or not host:
|
|
309
283
|
return host
|
|
310
284
|
|
|
311
|
-
if host in [
|
|
285
|
+
if host in ["localhost", "127.0.0.1"]:
|
|
312
286
|
logger.debug(f"Mapping host {host} to {self.docker_host_mapping} for Docker environment")
|
|
313
287
|
return self.docker_host_mapping
|
|
314
288
|
|
|
315
|
-
url_pattern = r
|
|
289
|
+
url_pattern = r"(localhost|127\.0\.0\.1)"
|
|
316
290
|
match = re.search(url_pattern, host)
|
|
317
291
|
if match:
|
|
318
292
|
original_host = match.group(1)
|
|
@@ -0,0 +1,144 @@
|
|
|
1
|
+
Metadata-Version: 2.3
|
|
2
|
+
Name: datamarket
|
|
3
|
+
Version: 0.9.4
|
|
4
|
+
Summary: Utilities that integrate advanced scraping knowledge into just one library.
|
|
5
|
+
License: GPL-3.0-or-later
|
|
6
|
+
Author: DataMarket
|
|
7
|
+
Author-email: techsupport@datamarket.es
|
|
8
|
+
Requires-Python: >=3.12,<4.0
|
|
9
|
+
Classifier: License :: OSI Approved :: GNU General Public License v3 (GPLv3)
|
|
10
|
+
Classifier: License :: OSI Approved :: GNU General Public License v3 or later (GPLv3+)
|
|
11
|
+
Classifier: Operating System :: OS Independent
|
|
12
|
+
Classifier: Programming Language :: Python :: 3
|
|
13
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
14
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
15
|
+
Provides-Extra: alchemy
|
|
16
|
+
Provides-Extra: aws
|
|
17
|
+
Provides-Extra: azure-storage-blob
|
|
18
|
+
Provides-Extra: boto3
|
|
19
|
+
Provides-Extra: chompjs
|
|
20
|
+
Provides-Extra: click
|
|
21
|
+
Provides-Extra: clickhouse-driver
|
|
22
|
+
Provides-Extra: datetime
|
|
23
|
+
Provides-Extra: demjson3
|
|
24
|
+
Provides-Extra: dnspython
|
|
25
|
+
Provides-Extra: drive
|
|
26
|
+
Provides-Extra: duckduckgo-search
|
|
27
|
+
Provides-Extra: fake-useragent
|
|
28
|
+
Provides-Extra: geoalchemy2
|
|
29
|
+
Provides-Extra: geopandas
|
|
30
|
+
Provides-Extra: geopy
|
|
31
|
+
Provides-Extra: google-api-python-client
|
|
32
|
+
Provides-Extra: google-auth-httplib2
|
|
33
|
+
Provides-Extra: google-auth-oauthlib
|
|
34
|
+
Provides-Extra: html2text
|
|
35
|
+
Provides-Extra: httpx
|
|
36
|
+
Provides-Extra: json5
|
|
37
|
+
Provides-Extra: lxml
|
|
38
|
+
Provides-Extra: nodriver
|
|
39
|
+
Provides-Extra: openpyxl
|
|
40
|
+
Provides-Extra: pandas
|
|
41
|
+
Provides-Extra: pandera
|
|
42
|
+
Provides-Extra: peerdb
|
|
43
|
+
Provides-Extra: pillow
|
|
44
|
+
Provides-Extra: playwright
|
|
45
|
+
Provides-Extra: playwright-stealth
|
|
46
|
+
Provides-Extra: proxy
|
|
47
|
+
Provides-Extra: pyarrow
|
|
48
|
+
Provides-Extra: pydrive2
|
|
49
|
+
Provides-Extra: pymupdf
|
|
50
|
+
Provides-Extra: pysocks
|
|
51
|
+
Provides-Extra: pyspark
|
|
52
|
+
Provides-Extra: pytest
|
|
53
|
+
Provides-Extra: rapidfuzz
|
|
54
|
+
Provides-Extra: retry
|
|
55
|
+
Provides-Extra: shapely
|
|
56
|
+
Provides-Extra: soda-core-mysql
|
|
57
|
+
Provides-Extra: soda-core-postgres
|
|
58
|
+
Provides-Extra: stem
|
|
59
|
+
Provides-Extra: tqdm
|
|
60
|
+
Provides-Extra: undetected-chromedriver
|
|
61
|
+
Provides-Extra: unidecode
|
|
62
|
+
Provides-Extra: xmltodict
|
|
63
|
+
Requires-Dist: SQLAlchemy (>=2.0.0,<3.0.0) ; extra == "alchemy"
|
|
64
|
+
Requires-Dist: azure-storage-blob (>=12.0.0,<13.0.0) ; extra == "azure-storage-blob"
|
|
65
|
+
Requires-Dist: beautifulsoup4 (>=4.0.0,<5.0.0)
|
|
66
|
+
Requires-Dist: boto3 (>=1.0.0,<2.0.0) ; extra == "boto3" or extra == "aws" or extra == "peerdb"
|
|
67
|
+
Requires-Dist: chompjs (>=1.0.0,<2.0.0) ; extra == "chompjs"
|
|
68
|
+
Requires-Dist: click (>=8.0.0,<9.0.0) ; extra == "click"
|
|
69
|
+
Requires-Dist: clickhouse-driver (>=0.2.0,<0.3.0) ; extra == "clickhouse-driver" or extra == "peerdb"
|
|
70
|
+
Requires-Dist: croniter (>=3.0.0,<4.0.0)
|
|
71
|
+
Requires-Dist: datetime (>=5.0,<6.0) ; extra == "datetime"
|
|
72
|
+
Requires-Dist: demjson3 (>=3.0.0,<4.0.0) ; extra == "demjson3"
|
|
73
|
+
Requires-Dist: dnspython (>=2.0.0,<3.0.0) ; extra == "dnspython"
|
|
74
|
+
Requires-Dist: duckduckgo-search (>=7.0.0,<8.0.0) ; extra == "duckduckgo-search"
|
|
75
|
+
Requires-Dist: dynaconf (>=3.0.0,<4.0.0)
|
|
76
|
+
Requires-Dist: fake-useragent (>=2.0.0,<3.0.0) ; extra == "fake-useragent"
|
|
77
|
+
Requires-Dist: geoalchemy2 (>=0.17.0,<0.18.0) ; extra == "geoalchemy2"
|
|
78
|
+
Requires-Dist: geopandas (>=1.0.0,<2.0.0) ; extra == "geopandas"
|
|
79
|
+
Requires-Dist: geopy (>=2.0.0,<3.0.0) ; extra == "geopy"
|
|
80
|
+
Requires-Dist: google-api-python-client (>=2.0.0,<3.0.0) ; extra == "google-api-python-client"
|
|
81
|
+
Requires-Dist: google-auth-httplib2 (>=0.2.0,<0.3.0) ; extra == "google-auth-httplib2"
|
|
82
|
+
Requires-Dist: google-auth-oauthlib (>=1.0.0,<2.0.0) ; extra == "google-auth-oauthlib"
|
|
83
|
+
Requires-Dist: html2text (>=2024.0.0,<2025.0.0) ; extra == "html2text"
|
|
84
|
+
Requires-Dist: httpx[http2] (>=0.28.0,<0.29.0) ; extra == "httpx"
|
|
85
|
+
Requires-Dist: jinja2 (>=3.0.0,<4.0.0)
|
|
86
|
+
Requires-Dist: json5 (>=0.10.0,<0.11.0) ; extra == "json5"
|
|
87
|
+
Requires-Dist: lxml[html-clean] (>=5.0.0,<6.0.0) ; extra == "lxml"
|
|
88
|
+
Requires-Dist: nodriver (==0.38.post1) ; extra == "nodriver"
|
|
89
|
+
Requires-Dist: openpyxl (>=3.0.0,<4.0.0) ; extra == "openpyxl"
|
|
90
|
+
Requires-Dist: pandas (>=2.0.0,<3.0.0) ; extra == "pandas"
|
|
91
|
+
Requires-Dist: pandera (>=0.22.0,<0.23.0) ; extra == "pandera"
|
|
92
|
+
Requires-Dist: pendulum (>=3.0.0,<4.0.0)
|
|
93
|
+
Requires-Dist: pillow (>=11.0.0,<12.0.0) ; extra == "pillow"
|
|
94
|
+
Requires-Dist: playwright (==1.47.0) ; extra == "playwright"
|
|
95
|
+
Requires-Dist: pre-commit (>=4.0.0,<5.0.0)
|
|
96
|
+
Requires-Dist: psycopg2-binary (>=2.0.0,<3.0.0)
|
|
97
|
+
Requires-Dist: pyarrow (>=19.0.0,<20.0.0) ; extra == "pyarrow"
|
|
98
|
+
Requires-Dist: pydrive2 (>=1.0.0,<2.0.0) ; extra == "pydrive2" or extra == "drive"
|
|
99
|
+
Requires-Dist: pymupdf (>=1.0.0,<2.0.0) ; extra == "pymupdf"
|
|
100
|
+
Requires-Dist: pysocks (>=1.0.0,<2.0.0) ; extra == "pysocks"
|
|
101
|
+
Requires-Dist: pyspark (>=3.0.0,<4.0.0) ; extra == "pyspark"
|
|
102
|
+
Requires-Dist: pytest (>=8.0.0,<9.0.0) ; extra == "pytest"
|
|
103
|
+
Requires-Dist: rapidfuzz (>=3.0.0,<4.0.0) ; extra == "rapidfuzz"
|
|
104
|
+
Requires-Dist: requests (>=2.0.0,<3.0.0)
|
|
105
|
+
Requires-Dist: retry (>=0.9.0,<0.10.0) ; extra == "retry"
|
|
106
|
+
Requires-Dist: shapely (>=2.0.0,<3.0.0) ; extra == "shapely"
|
|
107
|
+
Requires-Dist: soda-core-mysql (>=3.0.0,<4.0.0) ; extra == "soda-core-mysql"
|
|
108
|
+
Requires-Dist: soda-core-postgres (>=3.0.0,<4.0.0) ; extra == "soda-core-postgres"
|
|
109
|
+
Requires-Dist: stem (>=1.0.0,<2.0.0) ; extra == "stem" or extra == "proxy"
|
|
110
|
+
Requires-Dist: tenacity (>=9.0.0,<10.0.0)
|
|
111
|
+
Requires-Dist: tqdm (>=4.0.0,<5.0.0) ; extra == "tqdm"
|
|
112
|
+
Requires-Dist: typer (>=0.15.0,<0.16.0)
|
|
113
|
+
Requires-Dist: unidecode (>=1.0.0,<2.0.0) ; extra == "unidecode"
|
|
114
|
+
Requires-Dist: xmltodict (>=0.14.0,<0.15.0) ; extra == "xmltodict"
|
|
115
|
+
Project-URL: Documentation, https://github.com/Data-Market/datamarket
|
|
116
|
+
Project-URL: Homepage, https://datamarket.es
|
|
117
|
+
Project-URL: Repository, https://github.com/Data-Market/datamarket
|
|
118
|
+
Description-Content-Type: text/markdown
|
|
119
|
+
|
|
120
|
+
# DataMarket scraping core
|
|
121
|
+
|
|
122
|
+
------------------------------------------------------
|
|
123
|
+
[](https://github.com/psf/black)
|
|
124
|
+
|
|
125
|
+
|
|
126
|
+
Utilities that integrate advance scraping knowledge into just one library.
|
|
127
|
+
|
|
128
|
+
## Installation
|
|
129
|
+
|
|
130
|
+
To install this library in your Python environment:
|
|
131
|
+
|
|
132
|
+
`pip install datamarket`
|
|
133
|
+
|
|
134
|
+
## Documentation
|
|
135
|
+
|
|
136
|
+
This library has built functionalities for the following topics:
|
|
137
|
+
|
|
138
|
+
- **Databases**: through sqlalchemy it allows to insert records and perform queries in any database.
|
|
139
|
+
- **Proxies**: wide range of functions to perform HTTP requests through custom proxies or the Tor network.
|
|
140
|
+
- **Tinybird**: a Python client for this popular API.
|
|
141
|
+
- **Drive**: functions to upload, delete or authenticate to Google Drive.
|
|
142
|
+
- **FTP**: functions to upload, delete or authenticate to an FTP, SFTP or FTPS server.
|
|
143
|
+
- **Selenium**: wrapper for the main Selenium functions.
|
|
144
|
+
|
|
@@ -5,7 +5,7 @@ datamarket/interfaces/aws.py,sha256=R6lYdSCD6a4g9l6aFMtNDt_EX3kroe2untDhgy7XG1k,
|
|
|
5
5
|
datamarket/interfaces/drive.py,sha256=shbV5jpQVe_KPE-8Idx6Z9te5Zu1SmVfrvSAyd9ZIgE,2915
|
|
6
6
|
datamarket/interfaces/ftp.py,sha256=Owk3D7tiF47_ZFT3Dc9h4_BaPsWtcJUbhagjpQB19q8,1900
|
|
7
7
|
datamarket/interfaces/nominatim.py,sha256=_gFJ04D-ju5xn3wuaGT5Pj5jhf4F5eINpxOpuQL_dIQ,3664
|
|
8
|
-
datamarket/interfaces/peerdb.py,sha256=
|
|
8
|
+
datamarket/interfaces/peerdb.py,sha256=FhBLJfR2EMT9Rsnj_OJXvC14E5OlXGsMrPUQ1AQlwPY,20717
|
|
9
9
|
datamarket/interfaces/proxy.py,sha256=updoOStKd8-nQBbxWbnD9eOt6HksnYi-5dQ0rEySf5M,3152
|
|
10
10
|
datamarket/interfaces/tinybird.py,sha256=AYrcRGNOCoCt7ojilkWa27POROee9sTCwZ61GGHEPeM,2698
|
|
11
11
|
datamarket/params/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
@@ -17,7 +17,7 @@ datamarket/utils/main.py,sha256=0Abt3ww1VSPnX4AVKDcYzqDLAOEV_54iUHMLJfre2bg,6129
|
|
|
17
17
|
datamarket/utils/selenium.py,sha256=IMKlbLzXABFhACnWzhHmB0l2hhVzNwHGZwbo14nEewQ,2499
|
|
18
18
|
datamarket/utils/soda.py,sha256=eZTXFbI1P3WoMd1MM-YjoVTpdjTcDSWuvBb7ViBMhSQ,941
|
|
19
19
|
datamarket/utils/typer.py,sha256=FDF3l6gh3UlAFPsHCtesnekvct2rKz0oFn3uKARBQvE,814
|
|
20
|
-
datamarket-0.9.
|
|
21
|
-
datamarket-0.9.
|
|
22
|
-
datamarket-0.9.
|
|
23
|
-
datamarket-0.9.
|
|
20
|
+
datamarket-0.9.4.dist-info/LICENSE,sha256=OXLcl0T2SZ8Pmy2_dmlvKuetivmyPd5m1q-Gyd-zaYY,35149
|
|
21
|
+
datamarket-0.9.4.dist-info/METADATA,sha256=YYuxN--M3y9MX62_hG5Y1piS2TBBo_fl6MfJeLyyOZA,6360
|
|
22
|
+
datamarket-0.9.4.dist-info/WHEEL,sha256=XbeZDeTWKc1w7CSIyre5aMDU_-PohRwTQceYnisIYYY,88
|
|
23
|
+
datamarket-0.9.4.dist-info/RECORD,,
|
|
@@ -1,149 +0,0 @@
|
|
|
1
|
-
Metadata-Version: 2.3
|
|
2
|
-
Name: datamarket
|
|
3
|
-
Version: 0.9.3
|
|
4
|
-
Summary: Utilities that integrate advanced scraping knowledge into just one library.
|
|
5
|
-
License: GPL-3.0-or-later
|
|
6
|
-
Author: DataMarket
|
|
7
|
-
Author-email: techsupport@datamarket.es
|
|
8
|
-
Requires-Python: >=3.9,<4.0
|
|
9
|
-
Classifier: License :: OSI Approved :: GNU General Public License v3 (GPLv3)
|
|
10
|
-
Classifier: License :: OSI Approved :: GNU General Public License v3 or later (GPLv3+)
|
|
11
|
-
Classifier: Operating System :: OS Independent
|
|
12
|
-
Classifier: Programming Language :: Python :: 3
|
|
13
|
-
Classifier: Programming Language :: Python :: 3.9
|
|
14
|
-
Classifier: Programming Language :: Python :: 3.10
|
|
15
|
-
Classifier: Programming Language :: Python :: 3.11
|
|
16
|
-
Classifier: Programming Language :: Python :: 3.12
|
|
17
|
-
Classifier: Programming Language :: Python :: 3.13
|
|
18
|
-
Provides-Extra: alchemy
|
|
19
|
-
Provides-Extra: aws
|
|
20
|
-
Provides-Extra: azure-storage-blob
|
|
21
|
-
Provides-Extra: boto3
|
|
22
|
-
Provides-Extra: chompjs
|
|
23
|
-
Provides-Extra: click
|
|
24
|
-
Provides-Extra: clickhouse-driver
|
|
25
|
-
Provides-Extra: datetime
|
|
26
|
-
Provides-Extra: demjson3
|
|
27
|
-
Provides-Extra: dnspython
|
|
28
|
-
Provides-Extra: drive
|
|
29
|
-
Provides-Extra: duckduckgo-search
|
|
30
|
-
Provides-Extra: fake-useragent
|
|
31
|
-
Provides-Extra: geoalchemy2
|
|
32
|
-
Provides-Extra: geopandas
|
|
33
|
-
Provides-Extra: geopy
|
|
34
|
-
Provides-Extra: google-api-python-client
|
|
35
|
-
Provides-Extra: google-auth-httplib2
|
|
36
|
-
Provides-Extra: google-auth-oauthlib
|
|
37
|
-
Provides-Extra: html2text
|
|
38
|
-
Provides-Extra: httpx
|
|
39
|
-
Provides-Extra: json5
|
|
40
|
-
Provides-Extra: lxml
|
|
41
|
-
Provides-Extra: nodriver
|
|
42
|
-
Provides-Extra: openpyxl
|
|
43
|
-
Provides-Extra: pandas
|
|
44
|
-
Provides-Extra: pandera
|
|
45
|
-
Provides-Extra: peerdb
|
|
46
|
-
Provides-Extra: pillow
|
|
47
|
-
Provides-Extra: playwright
|
|
48
|
-
Provides-Extra: playwright-stealth
|
|
49
|
-
Provides-Extra: proxy
|
|
50
|
-
Provides-Extra: pyarrow
|
|
51
|
-
Provides-Extra: pydrive2
|
|
52
|
-
Provides-Extra: pymupdf
|
|
53
|
-
Provides-Extra: pysocks
|
|
54
|
-
Provides-Extra: pyspark
|
|
55
|
-
Provides-Extra: pytest
|
|
56
|
-
Provides-Extra: rapidfuzz
|
|
57
|
-
Provides-Extra: retry
|
|
58
|
-
Provides-Extra: shapely
|
|
59
|
-
Provides-Extra: soda-core-mysql
|
|
60
|
-
Provides-Extra: soda-core-postgres
|
|
61
|
-
Provides-Extra: stem
|
|
62
|
-
Provides-Extra: tqdm
|
|
63
|
-
Provides-Extra: undetected-chromedriver
|
|
64
|
-
Provides-Extra: unidecode
|
|
65
|
-
Provides-Extra: xmltodict
|
|
66
|
-
Requires-Dist: SQLAlchemy (==2.0.36) ; extra == "alchemy"
|
|
67
|
-
Requires-Dist: azure-storage-blob (==12.23.1) ; extra == "azure-storage-blob"
|
|
68
|
-
Requires-Dist: beautifulsoup4 (==4.12.3)
|
|
69
|
-
Requires-Dist: boto3 (==1.35.53) ; extra == "boto3" or extra == "aws" or extra == "peerdb"
|
|
70
|
-
Requires-Dist: chompjs (==1.3.0) ; extra == "chompjs"
|
|
71
|
-
Requires-Dist: click (==8.1.7) ; extra == "click"
|
|
72
|
-
Requires-Dist: clickhouse-driver (==0.2.9) ; extra == "clickhouse-driver" or extra == "peerdb"
|
|
73
|
-
Requires-Dist: croniter (==3.0.4)
|
|
74
|
-
Requires-Dist: datetime (==5.5) ; extra == "datetime"
|
|
75
|
-
Requires-Dist: demjson3 (==3.0.6) ; extra == "demjson3"
|
|
76
|
-
Requires-Dist: dnspython (==2.7.0) ; extra == "dnspython"
|
|
77
|
-
Requires-Dist: duckduckgo-search (==6.2.11b1) ; extra == "duckduckgo-search"
|
|
78
|
-
Requires-Dist: dynaconf (==3.2.6)
|
|
79
|
-
Requires-Dist: fake-useragent (==1.5.1) ; extra == "fake-useragent"
|
|
80
|
-
Requires-Dist: geoalchemy2 (==0.15.2) ; extra == "geoalchemy2"
|
|
81
|
-
Requires-Dist: geopandas (==1.0.1) ; extra == "geopandas"
|
|
82
|
-
Requires-Dist: geopy (==2.4.1) ; extra == "geopy"
|
|
83
|
-
Requires-Dist: google-api-python-client (==2.151.0) ; extra == "google-api-python-client"
|
|
84
|
-
Requires-Dist: google-auth-httplib2 (==0.2.0) ; extra == "google-auth-httplib2"
|
|
85
|
-
Requires-Dist: google-auth-oauthlib (==1.2.1) ; extra == "google-auth-oauthlib"
|
|
86
|
-
Requires-Dist: html2text (==2024.2.26) ; extra == "html2text"
|
|
87
|
-
Requires-Dist: httpx[http2] (==0.28.1) ; extra == "httpx"
|
|
88
|
-
Requires-Dist: jinja2 (==3.1.5)
|
|
89
|
-
Requires-Dist: json5 (==0.9.25) ; extra == "json5"
|
|
90
|
-
Requires-Dist: lxml[html-clean] (==5.3.0) ; extra == "lxml"
|
|
91
|
-
Requires-Dist: nodriver (==0.38.post1) ; extra == "nodriver"
|
|
92
|
-
Requires-Dist: openpyxl (==3.1.5) ; extra == "openpyxl"
|
|
93
|
-
Requires-Dist: pandas (==2.2.3) ; extra == "pandas"
|
|
94
|
-
Requires-Dist: pandera (==0.20.4) ; extra == "pandera"
|
|
95
|
-
Requires-Dist: pendulum (==3.0.0)
|
|
96
|
-
Requires-Dist: pillow (==11.0.0) ; extra == "pillow"
|
|
97
|
-
Requires-Dist: playwright (==1.47.0) ; extra == "playwright"
|
|
98
|
-
Requires-Dist: playwright-stealth (==1.0.6) ; extra == "playwright-stealth"
|
|
99
|
-
Requires-Dist: pre-commit (==4.0.1)
|
|
100
|
-
Requires-Dist: psycopg2-binary (==2.9.10)
|
|
101
|
-
Requires-Dist: pyarrow (==17.0.0) ; extra == "pyarrow"
|
|
102
|
-
Requires-Dist: pydrive2 (==1.20.0) ; extra == "pydrive2" or extra == "drive"
|
|
103
|
-
Requires-Dist: pymupdf (==1.24.13) ; extra == "pymupdf"
|
|
104
|
-
Requires-Dist: pysocks (==1.7.1) ; extra == "pysocks"
|
|
105
|
-
Requires-Dist: pyspark (==3.5.3) ; extra == "pyspark"
|
|
106
|
-
Requires-Dist: pytest (==8.3.3) ; extra == "pytest"
|
|
107
|
-
Requires-Dist: rapidfuzz (==3.10.1) ; extra == "rapidfuzz"
|
|
108
|
-
Requires-Dist: requests (==2.32.3)
|
|
109
|
-
Requires-Dist: retry (==0.9.2) ; extra == "retry"
|
|
110
|
-
Requires-Dist: shapely (==2.0.6) ; extra == "shapely"
|
|
111
|
-
Requires-Dist: soda-core-mysql (==3.4.4) ; extra == "soda-core-mysql"
|
|
112
|
-
Requires-Dist: soda-core-postgres (==3.4.1) ; extra == "soda-core-postgres"
|
|
113
|
-
Requires-Dist: stem (==1.8.2) ; extra == "stem" or extra == "proxy"
|
|
114
|
-
Requires-Dist: tenacity (==9.0.0)
|
|
115
|
-
Requires-Dist: tqdm (==4.66.6) ; extra == "tqdm"
|
|
116
|
-
Requires-Dist: typer (==0.12.5)
|
|
117
|
-
Requires-Dist: undetected-chromedriver (==3.5.5) ; extra == "undetected-chromedriver"
|
|
118
|
-
Requires-Dist: unidecode (==1.3.8) ; extra == "unidecode"
|
|
119
|
-
Requires-Dist: xmltodict (==0.14.2) ; extra == "xmltodict"
|
|
120
|
-
Project-URL: Documentation, https://github.com/Data-Market/datamarket
|
|
121
|
-
Project-URL: Homepage, https://datamarket.es
|
|
122
|
-
Project-URL: Repository, https://github.com/Data-Market/datamarket
|
|
123
|
-
Description-Content-Type: text/markdown
|
|
124
|
-
|
|
125
|
-
# DataMarket scraping core
|
|
126
|
-
|
|
127
|
-
------------------------------------------------------
|
|
128
|
-
[](https://github.com/psf/black)
|
|
129
|
-
|
|
130
|
-
|
|
131
|
-
Utilities that integrate advance scraping knowledge into just one library.
|
|
132
|
-
|
|
133
|
-
## Installation
|
|
134
|
-
|
|
135
|
-
To install this library in your Python environment:
|
|
136
|
-
|
|
137
|
-
`pip install datamarket`
|
|
138
|
-
|
|
139
|
-
## Documentation
|
|
140
|
-
|
|
141
|
-
This library has built functionalities for the following topics:
|
|
142
|
-
|
|
143
|
-
- **Databases**: through sqlalchemy it allows to insert records and perform queries in any database.
|
|
144
|
-
- **Proxies**: wide range of functions to perform HTTP requests through custom proxies or the Tor network.
|
|
145
|
-
- **Tinybird**: a Python client for this popular API.
|
|
146
|
-
- **Drive**: functions to upload, delete or authenticate to Google Drive.
|
|
147
|
-
- **FTP**: functions to upload, delete or authenticate to an FTP, SFTP or FTPS server.
|
|
148
|
-
- **Selenium**: wrapper for the main Selenium functions.
|
|
149
|
-
|
|
File without changes
|
|
File without changes
|