datamarket 0.9.2__tar.gz → 0.9.4__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of datamarket might be problematic. Click here for more details.

Files changed (25)
  1. datamarket-0.9.4/PKG-INFO +144 -0
  2. datamarket-0.9.4/pyproject.toml +129 -0
  3. {datamarket-0.9.2 → datamarket-0.9.4}/src/datamarket/interfaces/aws.py +9 -7
  4. {datamarket-0.9.2 → datamarket-0.9.4}/src/datamarket/interfaces/ftp.py +3 -1
  5. {datamarket-0.9.2 → datamarket-0.9.4}/src/datamarket/interfaces/peerdb.py +8 -34
  6. {datamarket-0.9.2 → datamarket-0.9.4}/src/datamarket/utils/main.py +16 -7
  7. datamarket-0.9.2/PKG-INFO +0 -149
  8. datamarket-0.9.2/pyproject.toml +0 -130
  9. {datamarket-0.9.2 → datamarket-0.9.4}/LICENSE +0 -0
  10. {datamarket-0.9.2 → datamarket-0.9.4}/README.md +0 -0
  11. {datamarket-0.9.2 → datamarket-0.9.4}/src/datamarket/__init__.py +0 -0
  12. {datamarket-0.9.2 → datamarket-0.9.4}/src/datamarket/interfaces/__init__.py +0 -0
  13. {datamarket-0.9.2 → datamarket-0.9.4}/src/datamarket/interfaces/alchemy.py +0 -0
  14. {datamarket-0.9.2 → datamarket-0.9.4}/src/datamarket/interfaces/drive.py +0 -0
  15. {datamarket-0.9.2 → datamarket-0.9.4}/src/datamarket/interfaces/nominatim.py +0 -0
  16. {datamarket-0.9.2 → datamarket-0.9.4}/src/datamarket/interfaces/proxy.py +0 -0
  17. {datamarket-0.9.2 → datamarket-0.9.4}/src/datamarket/interfaces/tinybird.py +0 -0
  18. {datamarket-0.9.2 → datamarket-0.9.4}/src/datamarket/params/__init__.py +0 -0
  19. {datamarket-0.9.2 → datamarket-0.9.4}/src/datamarket/params/nominatim.py +0 -0
  20. {datamarket-0.9.2 → datamarket-0.9.4}/src/datamarket/utils/__init__.py +0 -0
  21. {datamarket-0.9.2 → datamarket-0.9.4}/src/datamarket/utils/airflow.py +0 -0
  22. {datamarket-0.9.2 → datamarket-0.9.4}/src/datamarket/utils/alchemy.py +0 -0
  23. {datamarket-0.9.2 → datamarket-0.9.4}/src/datamarket/utils/selenium.py +0 -0
  24. {datamarket-0.9.2 → datamarket-0.9.4}/src/datamarket/utils/soda.py +0 -0
  25. {datamarket-0.9.2 → datamarket-0.9.4}/src/datamarket/utils/typer.py +0 -0
@@ -0,0 +1,144 @@
1
+ Metadata-Version: 2.3
2
+ Name: datamarket
3
+ Version: 0.9.4
4
+ Summary: Utilities that integrate advanced scraping knowledge into just one library.
5
+ License: GPL-3.0-or-later
6
+ Author: DataMarket
7
+ Author-email: techsupport@datamarket.es
8
+ Requires-Python: >=3.12,<4.0
9
+ Classifier: License :: OSI Approved :: GNU General Public License v3 (GPLv3)
10
+ Classifier: License :: OSI Approved :: GNU General Public License v3 or later (GPLv3+)
11
+ Classifier: Operating System :: OS Independent
12
+ Classifier: Programming Language :: Python :: 3
13
+ Classifier: Programming Language :: Python :: 3.12
14
+ Classifier: Programming Language :: Python :: 3.13
15
+ Provides-Extra: alchemy
16
+ Provides-Extra: aws
17
+ Provides-Extra: azure-storage-blob
18
+ Provides-Extra: boto3
19
+ Provides-Extra: chompjs
20
+ Provides-Extra: click
21
+ Provides-Extra: clickhouse-driver
22
+ Provides-Extra: datetime
23
+ Provides-Extra: demjson3
24
+ Provides-Extra: dnspython
25
+ Provides-Extra: drive
26
+ Provides-Extra: duckduckgo-search
27
+ Provides-Extra: fake-useragent
28
+ Provides-Extra: geoalchemy2
29
+ Provides-Extra: geopandas
30
+ Provides-Extra: geopy
31
+ Provides-Extra: google-api-python-client
32
+ Provides-Extra: google-auth-httplib2
33
+ Provides-Extra: google-auth-oauthlib
34
+ Provides-Extra: html2text
35
+ Provides-Extra: httpx
36
+ Provides-Extra: json5
37
+ Provides-Extra: lxml
38
+ Provides-Extra: nodriver
39
+ Provides-Extra: openpyxl
40
+ Provides-Extra: pandas
41
+ Provides-Extra: pandera
42
+ Provides-Extra: peerdb
43
+ Provides-Extra: pillow
44
+ Provides-Extra: playwright
45
+ Provides-Extra: playwright-stealth
46
+ Provides-Extra: proxy
47
+ Provides-Extra: pyarrow
48
+ Provides-Extra: pydrive2
49
+ Provides-Extra: pymupdf
50
+ Provides-Extra: pysocks
51
+ Provides-Extra: pyspark
52
+ Provides-Extra: pytest
53
+ Provides-Extra: rapidfuzz
54
+ Provides-Extra: retry
55
+ Provides-Extra: shapely
56
+ Provides-Extra: soda-core-mysql
57
+ Provides-Extra: soda-core-postgres
58
+ Provides-Extra: stem
59
+ Provides-Extra: tqdm
60
+ Provides-Extra: undetected-chromedriver
61
+ Provides-Extra: unidecode
62
+ Provides-Extra: xmltodict
63
+ Requires-Dist: SQLAlchemy (>=2.0.0,<3.0.0) ; extra == "alchemy"
64
+ Requires-Dist: azure-storage-blob (>=12.0.0,<13.0.0) ; extra == "azure-storage-blob"
65
+ Requires-Dist: beautifulsoup4 (>=4.0.0,<5.0.0)
66
+ Requires-Dist: boto3 (>=1.0.0,<2.0.0) ; extra == "boto3" or extra == "aws" or extra == "peerdb"
67
+ Requires-Dist: chompjs (>=1.0.0,<2.0.0) ; extra == "chompjs"
68
+ Requires-Dist: click (>=8.0.0,<9.0.0) ; extra == "click"
69
+ Requires-Dist: clickhouse-driver (>=0.2.0,<0.3.0) ; extra == "clickhouse-driver" or extra == "peerdb"
70
+ Requires-Dist: croniter (>=3.0.0,<4.0.0)
71
+ Requires-Dist: datetime (>=5.0,<6.0) ; extra == "datetime"
72
+ Requires-Dist: demjson3 (>=3.0.0,<4.0.0) ; extra == "demjson3"
73
+ Requires-Dist: dnspython (>=2.0.0,<3.0.0) ; extra == "dnspython"
74
+ Requires-Dist: duckduckgo-search (>=7.0.0,<8.0.0) ; extra == "duckduckgo-search"
75
+ Requires-Dist: dynaconf (>=3.0.0,<4.0.0)
76
+ Requires-Dist: fake-useragent (>=2.0.0,<3.0.0) ; extra == "fake-useragent"
77
+ Requires-Dist: geoalchemy2 (>=0.17.0,<0.18.0) ; extra == "geoalchemy2"
78
+ Requires-Dist: geopandas (>=1.0.0,<2.0.0) ; extra == "geopandas"
79
+ Requires-Dist: geopy (>=2.0.0,<3.0.0) ; extra == "geopy"
80
+ Requires-Dist: google-api-python-client (>=2.0.0,<3.0.0) ; extra == "google-api-python-client"
81
+ Requires-Dist: google-auth-httplib2 (>=0.2.0,<0.3.0) ; extra == "google-auth-httplib2"
82
+ Requires-Dist: google-auth-oauthlib (>=1.0.0,<2.0.0) ; extra == "google-auth-oauthlib"
83
+ Requires-Dist: html2text (>=2024.0.0,<2025.0.0) ; extra == "html2text"
84
+ Requires-Dist: httpx[http2] (>=0.28.0,<0.29.0) ; extra == "httpx"
85
+ Requires-Dist: jinja2 (>=3.0.0,<4.0.0)
86
+ Requires-Dist: json5 (>=0.10.0,<0.11.0) ; extra == "json5"
87
+ Requires-Dist: lxml[html-clean] (>=5.0.0,<6.0.0) ; extra == "lxml"
88
+ Requires-Dist: nodriver (==0.38.post1) ; extra == "nodriver"
89
+ Requires-Dist: openpyxl (>=3.0.0,<4.0.0) ; extra == "openpyxl"
90
+ Requires-Dist: pandas (>=2.0.0,<3.0.0) ; extra == "pandas"
91
+ Requires-Dist: pandera (>=0.22.0,<0.23.0) ; extra == "pandera"
92
+ Requires-Dist: pendulum (>=3.0.0,<4.0.0)
93
+ Requires-Dist: pillow (>=11.0.0,<12.0.0) ; extra == "pillow"
94
+ Requires-Dist: playwright (==1.47.0) ; extra == "playwright"
95
+ Requires-Dist: pre-commit (>=4.0.0,<5.0.0)
96
+ Requires-Dist: psycopg2-binary (>=2.0.0,<3.0.0)
97
+ Requires-Dist: pyarrow (>=19.0.0,<20.0.0) ; extra == "pyarrow"
98
+ Requires-Dist: pydrive2 (>=1.0.0,<2.0.0) ; extra == "pydrive2" or extra == "drive"
99
+ Requires-Dist: pymupdf (>=1.0.0,<2.0.0) ; extra == "pymupdf"
100
+ Requires-Dist: pysocks (>=1.0.0,<2.0.0) ; extra == "pysocks"
101
+ Requires-Dist: pyspark (>=3.0.0,<4.0.0) ; extra == "pyspark"
102
+ Requires-Dist: pytest (>=8.0.0,<9.0.0) ; extra == "pytest"
103
+ Requires-Dist: rapidfuzz (>=3.0.0,<4.0.0) ; extra == "rapidfuzz"
104
+ Requires-Dist: requests (>=2.0.0,<3.0.0)
105
+ Requires-Dist: retry (>=0.9.0,<0.10.0) ; extra == "retry"
106
+ Requires-Dist: shapely (>=2.0.0,<3.0.0) ; extra == "shapely"
107
+ Requires-Dist: soda-core-mysql (>=3.0.0,<4.0.0) ; extra == "soda-core-mysql"
108
+ Requires-Dist: soda-core-postgres (>=3.0.0,<4.0.0) ; extra == "soda-core-postgres"
109
+ Requires-Dist: stem (>=1.0.0,<2.0.0) ; extra == "stem" or extra == "proxy"
110
+ Requires-Dist: tenacity (>=9.0.0,<10.0.0)
111
+ Requires-Dist: tqdm (>=4.0.0,<5.0.0) ; extra == "tqdm"
112
+ Requires-Dist: typer (>=0.15.0,<0.16.0)
113
+ Requires-Dist: unidecode (>=1.0.0,<2.0.0) ; extra == "unidecode"
114
+ Requires-Dist: xmltodict (>=0.14.0,<0.15.0) ; extra == "xmltodict"
115
+ Project-URL: Documentation, https://github.com/Data-Market/datamarket
116
+ Project-URL: Homepage, https://datamarket.es
117
+ Project-URL: Repository, https://github.com/Data-Market/datamarket
118
+ Description-Content-Type: text/markdown
119
+
120
+ # DataMarket scraping core
121
+
122
+ ------------------------------------------------------
123
+ [![Code style: black](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/psf/black)
124
+
125
+
126
+ Utilities that integrate advanced scraping knowledge into just one library.
127
+
128
+ ## Installation
129
+
130
+ To install this library in your Python environment:
131
+
132
+ `pip install datamarket`
133
+
134
+ ## Documentation
135
+
136
+ This library has built functionalities for the following topics:
137
+
138
+ - **Databases**: through SQLAlchemy, it allows you to insert records and perform queries in any database.
139
+ - **Proxies**: wide range of functions to perform HTTP requests through custom proxies or the Tor network.
140
+ - **Tinybird**: a Python client for this popular API.
141
+ - **Drive**: functions to upload, delete or authenticate to Google Drive.
142
+ - **FTP**: functions to upload, delete or authenticate to an FTP, SFTP or FTPS server.
143
+ - **Selenium**: wrapper for the main Selenium functions.
144
+
@@ -0,0 +1,129 @@
1
+ [tool.poetry]
2
+ name = "datamarket"
3
+ version = "0.9.4"
4
+ description = "Utilities that integrate advanced scraping knowledge into just one library."
5
+ authors = ["DataMarket <techsupport@datamarket.es>"]
6
+ license = "GPL-3.0-or-later"
7
+ readme = "README.md"
8
+ homepage = "https://datamarket.es"
9
+ repository = "https://github.com/Data-Market/datamarket"
10
+ documentation = "https://github.com/Data-Market/datamarket"
11
+ classifiers = [
12
+ "Programming Language :: Python :: 3",
13
+ "License :: OSI Approved :: GNU General Public License v3 (GPLv3)",
14
+ "Operating System :: OS Independent",
15
+ ]
16
+
17
+ [tool.poetry.dependencies]
18
+ python = "^3.12"
19
+ typer = "~0.15.0"
20
+ psycopg2-binary = "^2.0.0"
21
+ requests = "^2.0.0"
22
+ tenacity = "^9.0.0"
23
+ beautifulsoup4 = "^4.0.0"
24
+ pre-commit = "^4.0.0"
25
+ pendulum = "^3.0.0"
26
+ croniter = "^3.0.0"
27
+ dynaconf = "^3.0.0"
28
+ jinja2 = "^3.0.0"
29
+
30
+ boto3 = { version = "^1.0.0", optional = true }
31
+ unidecode = { version = "^1.0.0", optional = true }
32
+ lxml = { extras = ["html-clean"], version = "^5.0.0", optional = true }
33
+ tqdm = { version = "^4.0.0", optional = true }
34
+ pandas = { version = "^2.0.0", optional = true }
35
+ pyarrow = { version = "^19.0.0", optional = true }
36
+ pytest = { version = "^8.0.0", optional = true }
37
+ playwright = { version = "1.47.0", optional = true }
38
+ tf-playwright-stealth = { version = "^1.0.0", optional = true }
39
+ soda-core-postgres = { version = "^3.0.0", optional = true }
40
+ soda-core-mysql = { version = "^3.0.0", optional = true }
41
+ fake-useragent = { version = "^2.0.0", optional = true }
42
+ pydrive2 = { version = "^1.0.0", optional = true }
43
+ clickhouse-driver = { version = "~0.2.0", optional = true }
44
+ stem = { version = "^1.0.0", optional = true }
45
+ click = { version = "^8.0.0", optional = true }
46
+ rapidfuzz = { version = "^3.0.0", optional = true }
47
+ demjson3 = { version = "^3.0.0", optional = true }
48
+ geopy = { version = "^2.0.0", optional = true }
49
+ nodriver = { version = "0.38.post1", optional = true }
50
+ retry = { version = "~0.9.0", optional = true }
51
+ shapely = { version = "^2.0.0", optional = true }
52
+ geopandas = { version = "^1.0.0", optional = true }
53
+ chompjs = { version = "^1.0.0", optional = true }
54
+ pillow = { version = "^11.0.0", optional = true }
55
+ duckduckgo-search = { version = "^7.0.0", optional = true }
56
+ pysocks = { version = "^1.0.0", optional = true }
57
+ xmltodict = { version = "~0.14.0", optional = true }
58
+ pymupdf = { version = "^1.0.0", optional = true }
59
+ html2text = { version = "^2024.0.0", optional = true }
60
+ pyspark = { version = "^3.0.0", optional = true }
61
+ pandera = { version = "~0.22.0", optional = true }
62
+ json5 = { version = "~0.10.0", optional = true }
63
+ geoalchemy2 = { version = "~0.17.0", optional = true }
64
+ datetime = { version = "^5.0", optional = true }
65
+ azure-storage-blob = { version = "^12.0.0", optional = true }
66
+ google-api-python-client = { version = "^2.0.0", optional = true }
67
+ google-auth-httplib2 = { version = "~0.2.0", optional = true }
68
+ google-auth-oauthlib = { version = "^1.0.0", optional = true }
69
+ dnspython = { version = "^2.0.0", optional = true }
70
+ openpyxl = { version = "^3.0.0", optional = true }
71
+ httpx = { extras = ["http2"], version = "~0.28.0", optional = true }
72
+ SQLAlchemy = { version = "^2.0.0", optional = true }
73
+
74
+ [tool.poetry.extras]
75
+ boto3 = ["boto3"]
76
+ unidecode = ["unidecode"]
77
+ lxml = ["lxml"]
78
+ tqdm = ["tqdm"]
79
+ pandas = ["pandas"]
80
+ pyarrow = ["pyarrow"]
81
+ pytest = ["pytest"]
82
+ playwright = ["playwright"]
83
+ playwright-stealth = ["playwright-stealth"]
84
+ soda-core-postgres = ["soda-core-postgres"]
85
+ soda-core-mysql = ["soda-core-mysql"]
86
+ fake-useragent = ["fake-useragent"]
87
+ pydrive2 = ["pydrive2"]
88
+ clickhouse-driver = ["clickhouse-driver"]
89
+ stem = ["stem"]
90
+ click = ["click"]
91
+ rapidfuzz = ["rapidfuzz"]
92
+ demjson3 = ["demjson3"]
93
+ geopy = ["geopy"]
94
+ nodriver = ["nodriver"]
95
+ undetected-chromedriver = ["undetected-chromedriver"]
96
+ retry = ["retry"]
97
+ shapely = ["shapely"]
98
+ geopandas = ["geopandas"]
99
+ chompjs = ["chompjs"]
100
+ pillow = ["pillow"]
101
+ duckduckgo-search = ["duckduckgo-search"]
102
+ pysocks = ["pysocks"]
103
+ xmltodict = ["xmltodict"]
104
+ pymupdf = ["pymupdf"]
105
+ html2text = ["html2text"]
106
+ pyspark = ["pyspark"]
107
+ pandera = ["pandera"]
108
+ json5 = ["json5"]
109
+ geoalchemy2 = ["geoalchemy2"]
110
+ datetime = ["datetime"]
111
+ azure-storage-blob = ["azure-storage-blob"]
112
+ google-api-python-client = ["google-api-python-client"]
113
+ google-auth-httplib2 = ["google-auth-httplib2"]
114
+ google-auth-oauthlib = ["google-auth-oauthlib"]
115
+ dnspython = ["dnspython"]
116
+ openpyxl = ["openpyxl"]
117
+ httpx = ["httpx"]
118
+
119
+ # Interface groups
120
+ aws = ["boto3"]
121
+ drive = ["pydrive2"]
122
+ peerdb = ["boto3", "clickhouse-driver"]
123
+ proxy = ["stem"]
124
+ alchemy = ["SQLAlchemy"]
125
+
126
+
127
+ [build-system]
128
+ requires = ["poetry-core>=1.0.0"]
129
+ build-backend = "poetry.core.masonry.api"
@@ -5,6 +5,8 @@ import io
5
5
  import logging
6
6
  import boto3
7
7
 
8
+ from ..utils.main import Config
9
+
8
10
  ########################################################################################################################
9
11
  # CLASSES
10
12
 
@@ -12,7 +14,7 @@ logger = logging.getLogger(__name__)
12
14
 
13
15
 
14
16
  class AWSInterface:
15
- def __init__(self, config):
17
+ def __init__(self, config: Config) -> None:
16
18
  self.profiles = []
17
19
  self.config = config
18
20
 
@@ -31,13 +33,13 @@ class AWSInterface:
31
33
  self.current_profile = self.profiles[0] if self.profiles else None
32
34
  self._update_resources()
33
35
 
34
- def _update_resources(self):
36
+ def _update_resources(self) -> None:
35
37
  if self.current_profile:
36
38
  self.s3 = self.current_profile["session"].resource("s3")
37
39
  self.s3_client = self.s3.meta.client
38
40
  self.bucket = self.current_profile["buckets"][0]
39
41
 
40
- def switch_profile(self, profile_name: str):
42
+ def switch_profile(self, profile_name: str) -> None:
41
43
  for profile in self.profiles:
42
44
  if profile["profile"] == profile_name:
43
45
  self.current_profile = profile
@@ -45,7 +47,7 @@ class AWSInterface:
45
47
  return
46
48
  logger.warning(f"Profile {profile_name} not found")
47
49
 
48
- def switch_bucket(self, bucket: str):
50
+ def switch_bucket(self, bucket: str) -> None:
49
51
  if bucket not in self.current_profile["buckets"]:
50
52
  logger.warning(
51
53
  f"Bucket {bucket} not found in profile {self.current_profile['profile']}"
@@ -54,14 +56,14 @@ class AWSInterface:
54
56
 
55
57
  self.bucket = bucket
56
58
 
57
- def get_file(self, s3_path: str):
59
+ def get_file(self, s3_path: str) -> None:
58
60
  try:
59
61
  return self.s3.Object(self.bucket, s3_path).get()
60
62
  except self.s3_client.exceptions.NoSuchKey:
61
63
  logger.info(f"{s3_path} does not exist")
62
64
 
63
- def read_file_as_bytes(self, s3_path: str):
65
+ def read_file_as_bytes(self, s3_path: str) -> io.BytesIO:
64
66
  return io.BytesIO(self.get_file(s3_path)["Body"].read())
65
67
 
66
- def upload_file(self, local_path: str, s3_path: str):
68
+ def upload_file(self, local_path: str, s3_path: str) -> None:
67
69
  self.s3.Bucket(self.bucket).upload_file(local_path, s3_path)
@@ -5,6 +5,8 @@ import logging
5
5
  from ftplib import FTP, FTP_TLS
6
6
  from pathlib import Path
7
7
 
8
+ from ..utils.main import Config
9
+
8
10
  ########################################################################################################################
9
11
  # CLASSES
10
12
 
@@ -12,7 +14,7 @@ logger = logging.getLogger(__name__)
12
14
 
13
15
 
14
16
  class FTPInterface:
15
- def __init__(self, config):
17
+ def __init__(self, config: Config):
16
18
  if "ftp" in config:
17
19
  self.config = config["ftp"]
18
20
 
@@ -216,52 +216,26 @@ class TransientS3:
216
216
  self.config = section
217
217
  self.bucket_name = self.config["bucket"]
218
218
  self.session = boto3.Session(profile_name=self.config["profile"])
219
- self.s3_client = self.session.client("s3")
219
+ self.s3_resource = self.session.resource("s3")
220
220
  self.credentials = self.session.get_credentials()
221
221
  self.access_key = self.credentials.access_key
222
222
  self.secret_key = self.credentials.secret_key
223
223
  self.region_name = self.session.region_name
224
- self.endpoint_url = self.s3_client.meta.endpoint_url
224
+ self.endpoint_url = self.s3_resource.meta.endpoint_url
225
225
  else:
226
226
  logger.warning("no peerdb.s3 section in config")
227
227
 
228
228
  def delete_paths_with_schema(self, schema_name):
229
229
  logger.info(f"Deleting paths containing '{schema_name}' from S3")
230
230
 
231
- paginator = self.s3_client.get_paginator("list_objects_v2")
232
- pages = paginator.paginate(Bucket=self.bucket_name, Delimiter="/")
231
+ bucket = self.s3_resource.Bucket(self.bucket_name)
233
232
 
234
- for page in pages:
235
- if "CommonPrefixes" in page:
236
- for prefix in page["CommonPrefixes"]:
237
- folder = prefix["Prefix"]
238
- if schema_name in folder:
239
- self._delete_folder_contents(folder)
233
+ for prefix in [schema_name, f"clone_{schema_name}"]:
234
+ objects_to_delete = bucket.objects.filter(Prefix=prefix)
235
+ objects_to_delete.delete()
240
236
 
241
237
  logger.info(f"Deleted paths containing '{schema_name}' from S3")
242
238
 
243
- def _delete_folder_contents(self, folder):
244
- logger.info(f"Deleting contents of folder: {folder}")
245
-
246
- paginator = self.s3_client.get_paginator("list_objects_v2")
247
- pages = paginator.paginate(Bucket=self.bucket_name, Prefix=folder)
248
-
249
- delete_us = dict(Objects=[])
250
- for page in pages:
251
- if "Contents" in page:
252
- for obj in page["Contents"]:
253
- delete_us["Objects"].append(dict(Key=obj["Key"]))
254
-
255
- # AWS limits to deleting 1000 objects at a time
256
- if len(delete_us["Objects"]) >= 1000:
257
- self.s3_client.delete_objects(Bucket=self.bucket_name, Delete=delete_us)
258
- delete_us = dict(Objects=[])
259
-
260
- if len(delete_us["Objects"]):
261
- self.s3_client.delete_objects(Bucket=self.bucket_name, Delete=delete_us)
262
-
263
- logger.info(f"Deleted contents of folder: {folder}")
264
-
265
239
 
266
240
  class PeerDBInterface:
267
241
  def __init__(self, config):
@@ -308,11 +282,11 @@ class PeerDBInterface:
308
282
  if not self.docker_host_mapping or not host:
309
283
  return host
310
284
 
311
- if host in ['localhost', '127.0.0.1']:
285
+ if host in ["localhost", "127.0.0.1"]:
312
286
  logger.debug(f"Mapping host {host} to {self.docker_host_mapping} for Docker environment")
313
287
  return self.docker_host_mapping
314
288
 
315
- url_pattern = r'(localhost|127\.0\.0\.1)'
289
+ url_pattern = r"(localhost|127\.0\.0\.1)"
316
290
  match = re.search(url_pattern, host)
317
291
  if match:
318
292
  original_host = match.group(1)
@@ -2,7 +2,6 @@
2
2
  # IMPORTS
3
3
 
4
4
  import asyncio
5
- import configparser
6
5
  import inspect
7
6
  import logging
8
7
  import random
@@ -16,6 +15,7 @@ from typing import Literal, Union
16
15
 
17
16
  import pendulum
18
17
  from croniter import croniter
18
+ from configparser import RawConfigParser
19
19
  from dynaconf import Dynaconf, add_converter
20
20
 
21
21
  ########################################################################################################################
@@ -23,6 +23,8 @@ from dynaconf import Dynaconf, add_converter
23
23
 
24
24
  logger = logging.getLogger(__name__)
25
25
 
26
+ Config = Union[RawConfigParser, Dynaconf]
27
+
26
28
 
27
29
  def get_granular_date(
28
30
  granularity: Union[Literal["monthly", "biweekly", "weekly", "daily"], str], tz: str = "Europe/Madrid"
@@ -64,16 +66,20 @@ def read_converter(path_str: str):
64
66
  return f.read()
65
67
 
66
68
 
67
- def get_config(config_file: Path, tz: str = "Europe/Madrid"):
69
+ def get_config(
70
+ config_file: Path, tz: str = "Europe/Madrid"
71
+ ) -> Union[RawConfigParser, Dynaconf]:
68
72
  if Path(config_file).suffix == ".ini":
69
73
  logger.warning("Using legacy INI config reader. Please migrate to TOML")
70
- cfg = configparser.RawConfigParser()
74
+ cfg = RawConfigParser()
71
75
  cfg.read(config_file)
72
76
  return cfg
73
77
 
74
78
  add_converter("read", read_converter)
75
79
 
76
80
  dt_now = get_granular_date("now", tz)
81
+ dt_weekly = get_granular_date("weekly", tz)
82
+ dt_biweekly = get_granular_date("biweekly", tz)
77
83
 
78
84
  config = Dynaconf(
79
85
  environments=True,
@@ -84,14 +90,17 @@ def get_config(config_file: Path, tz: str = "Europe/Madrid"):
84
90
  config.load_file(path=Path.home() / config_file.name)
85
91
 
86
92
  config.vars = {
87
- "now": dt_now.strftime("%Y-%m-%d %H:%M:%S"),
88
- "today": dt_now.strftime("%Y-%m-%d"),
89
93
  "year": dt_now.strftime("%Y"),
90
94
  "month": dt_now.strftime("%m"),
91
95
  "day": dt_now.strftime("%d"),
92
- "biweekly_date": get_granular_date("biweekly", tz).strftime("%Y-%m-%d"),
93
- "today_stripped": dt_now.strftime("%Y%m%d"),
96
+ "now": dt_now.strftime("%Y-%m-%d %H:%M:%S"),
94
97
  "now_stripped": dt_now.strftime("%Y%m%d%H%M%S"),
98
+ "today": dt_now.strftime("%Y-%m-%d"),
99
+ "today_stripped": dt_now.strftime("%Y%m%d"),
100
+ "weekly_date": dt_weekly.strftime("%Y-%m-%d"),
101
+ "weekly_date_stripped": dt_weekly.strftime("%Y%m%d"),
102
+ "biweekly_date": dt_biweekly.strftime("%Y-%m-%d"),
103
+ "biweekly_date_stripped": dt_biweekly.strftime("%Y%m%d"),
95
104
  "dynaconf_merge": True,
96
105
  }
97
106
 
datamarket-0.9.2/PKG-INFO DELETED
@@ -1,149 +0,0 @@
1
- Metadata-Version: 2.3
2
- Name: datamarket
3
- Version: 0.9.2
4
- Summary: Utilities that integrate advanced scraping knowledge into just one library.
5
- License: GPL-3.0-or-later
6
- Author: DataMarket
7
- Author-email: techsupport@datamarket.es
8
- Requires-Python: >=3.9,<4.0
9
- Classifier: License :: OSI Approved :: GNU General Public License v3 (GPLv3)
10
- Classifier: License :: OSI Approved :: GNU General Public License v3 or later (GPLv3+)
11
- Classifier: Operating System :: OS Independent
12
- Classifier: Programming Language :: Python :: 3
13
- Classifier: Programming Language :: Python :: 3.9
14
- Classifier: Programming Language :: Python :: 3.10
15
- Classifier: Programming Language :: Python :: 3.11
16
- Classifier: Programming Language :: Python :: 3.12
17
- Classifier: Programming Language :: Python :: 3.13
18
- Provides-Extra: alchemy
19
- Provides-Extra: aws
20
- Provides-Extra: azure-storage-blob
21
- Provides-Extra: boto3
22
- Provides-Extra: chompjs
23
- Provides-Extra: click
24
- Provides-Extra: clickhouse-driver
25
- Provides-Extra: datetime
26
- Provides-Extra: demjson3
27
- Provides-Extra: dnspython
28
- Provides-Extra: drive
29
- Provides-Extra: duckduckgo-search
30
- Provides-Extra: fake-useragent
31
- Provides-Extra: geoalchemy2
32
- Provides-Extra: geopandas
33
- Provides-Extra: geopy
34
- Provides-Extra: google-api-python-client
35
- Provides-Extra: google-auth-httplib2
36
- Provides-Extra: google-auth-oauthlib
37
- Provides-Extra: html2text
38
- Provides-Extra: httpx
39
- Provides-Extra: json5
40
- Provides-Extra: lxml
41
- Provides-Extra: nodriver
42
- Provides-Extra: openpyxl
43
- Provides-Extra: pandas
44
- Provides-Extra: pandera
45
- Provides-Extra: peerdb
46
- Provides-Extra: pillow
47
- Provides-Extra: playwright
48
- Provides-Extra: playwright-stealth
49
- Provides-Extra: proxy
50
- Provides-Extra: pyarrow
51
- Provides-Extra: pydrive2
52
- Provides-Extra: pymupdf
53
- Provides-Extra: pysocks
54
- Provides-Extra: pyspark
55
- Provides-Extra: pytest
56
- Provides-Extra: rapidfuzz
57
- Provides-Extra: retry
58
- Provides-Extra: shapely
59
- Provides-Extra: soda-core-mysql
60
- Provides-Extra: soda-core-postgres
61
- Provides-Extra: stem
62
- Provides-Extra: tqdm
63
- Provides-Extra: undetected-chromedriver
64
- Provides-Extra: unidecode
65
- Provides-Extra: xmltodict
66
- Requires-Dist: SQLAlchemy (==2.0.36) ; extra == "alchemy"
67
- Requires-Dist: azure-storage-blob (==12.23.1) ; extra == "azure-storage-blob"
68
- Requires-Dist: beautifulsoup4 (==4.12.3)
69
- Requires-Dist: boto3 (==1.35.53) ; extra == "boto3" or extra == "aws" or extra == "peerdb"
70
- Requires-Dist: chompjs (==1.3.0) ; extra == "chompjs"
71
- Requires-Dist: click (==8.1.7) ; extra == "click"
72
- Requires-Dist: clickhouse-driver (==0.2.9) ; extra == "clickhouse-driver" or extra == "peerdb"
73
- Requires-Dist: croniter (==3.0.4)
74
- Requires-Dist: datetime (==5.5) ; extra == "datetime"
75
- Requires-Dist: demjson3 (==3.0.6) ; extra == "demjson3"
76
- Requires-Dist: dnspython (==2.7.0) ; extra == "dnspython"
77
- Requires-Dist: duckduckgo-search (==6.2.11b1) ; extra == "duckduckgo-search"
78
- Requires-Dist: dynaconf (==3.2.6)
79
- Requires-Dist: fake-useragent (==1.5.1) ; extra == "fake-useragent"
80
- Requires-Dist: geoalchemy2 (==0.15.2) ; extra == "geoalchemy2"
81
- Requires-Dist: geopandas (==1.0.1) ; extra == "geopandas"
82
- Requires-Dist: geopy (==2.4.1) ; extra == "geopy"
83
- Requires-Dist: google-api-python-client (==2.151.0) ; extra == "google-api-python-client"
84
- Requires-Dist: google-auth-httplib2 (==0.2.0) ; extra == "google-auth-httplib2"
85
- Requires-Dist: google-auth-oauthlib (==1.2.1) ; extra == "google-auth-oauthlib"
86
- Requires-Dist: html2text (==2024.2.26) ; extra == "html2text"
87
- Requires-Dist: httpx[http2] (==0.28.1) ; extra == "httpx"
88
- Requires-Dist: jinja2 (==3.1.5)
89
- Requires-Dist: json5 (==0.9.25) ; extra == "json5"
90
- Requires-Dist: lxml[html-clean] (==5.3.0) ; extra == "lxml"
91
- Requires-Dist: nodriver (==0.38.post1) ; extra == "nodriver"
92
- Requires-Dist: openpyxl (==3.1.5) ; extra == "openpyxl"
93
- Requires-Dist: pandas (==2.2.3) ; extra == "pandas"
94
- Requires-Dist: pandera (==0.20.4) ; extra == "pandera"
95
- Requires-Dist: pendulum (==3.0.0)
96
- Requires-Dist: pillow (==11.0.0) ; extra == "pillow"
97
- Requires-Dist: playwright (==1.47.0) ; extra == "playwright"
98
- Requires-Dist: playwright-stealth (==1.0.6) ; extra == "playwright-stealth"
99
- Requires-Dist: pre-commit (==4.0.1)
100
- Requires-Dist: psycopg2-binary (==2.9.10)
101
- Requires-Dist: pyarrow (==17.0.0) ; extra == "pyarrow"
102
- Requires-Dist: pydrive2 (==1.20.0) ; extra == "pydrive2" or extra == "drive"
103
- Requires-Dist: pymupdf (==1.24.13) ; extra == "pymupdf"
104
- Requires-Dist: pysocks (==1.7.1) ; extra == "pysocks"
105
- Requires-Dist: pyspark (==3.5.3) ; extra == "pyspark"
106
- Requires-Dist: pytest (==8.3.3) ; extra == "pytest"
107
- Requires-Dist: rapidfuzz (==3.10.1) ; extra == "rapidfuzz"
108
- Requires-Dist: requests (==2.32.3)
109
- Requires-Dist: retry (==0.9.2) ; extra == "retry"
110
- Requires-Dist: shapely (==2.0.6) ; extra == "shapely"
111
- Requires-Dist: soda-core-mysql (==3.4.4) ; extra == "soda-core-mysql"
112
- Requires-Dist: soda-core-postgres (==3.4.1) ; extra == "soda-core-postgres"
113
- Requires-Dist: stem (==1.8.2) ; extra == "stem" or extra == "proxy"
114
- Requires-Dist: tenacity (==9.0.0)
115
- Requires-Dist: tqdm (==4.66.6) ; extra == "tqdm"
116
- Requires-Dist: typer (==0.12.5)
117
- Requires-Dist: undetected-chromedriver (==3.5.5) ; extra == "undetected-chromedriver"
118
- Requires-Dist: unidecode (==1.3.8) ; extra == "unidecode"
119
- Requires-Dist: xmltodict (==0.14.2) ; extra == "xmltodict"
120
- Project-URL: Documentation, https://github.com/Data-Market/datamarket
121
- Project-URL: Homepage, https://datamarket.es
122
- Project-URL: Repository, https://github.com/Data-Market/datamarket
123
- Description-Content-Type: text/markdown
124
-
125
- # DataMarket scraping core
126
-
127
- ------------------------------------------------------
128
- [![Code style: black](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/psf/black)
129
-
130
-
131
- Utilities that integrate advanced scraping knowledge into just one library.
132
-
133
- ## Installation
134
-
135
- To install this library in your Python environment:
136
-
137
- `pip install datamarket`
138
-
139
- ## Documentation
140
-
141
- This library has built functionalities for the following topics:
142
-
143
- - **Databases**: through SQLAlchemy, it allows you to insert records and perform queries in any database.
144
- - **Proxies**: wide range of functions to perform HTTP requests through custom proxies or the Tor network.
145
- - **Tinybird**: a Python client for this popular API.
146
- - **Drive**: functions to upload, delete or authenticate to Google Drive.
147
- - **FTP**: functions to upload, delete or authenticate to an FTP, SFTP or FTPS server.
148
- - **Selenium**: wrapper for the main Selenium functions.
149
-
@@ -1,130 +0,0 @@
1
- [tool.poetry]
2
- name = "datamarket"
3
- version = "0.9.2"
4
- description = "Utilities that integrate advanced scraping knowledge into just one library."
5
- authors = ["DataMarket <techsupport@datamarket.es>"]
6
- license = "GPL-3.0-or-later"
7
- readme = "README.md"
8
- homepage = "https://datamarket.es"
9
- repository = "https://github.com/Data-Market/datamarket"
10
- documentation = "https://github.com/Data-Market/datamarket"
11
- classifiers = [
12
- "Programming Language :: Python :: 3",
13
- "License :: OSI Approved :: GNU General Public License v3 (GPLv3)",
14
- "Operating System :: OS Independent",
15
- ]
16
-
17
- [tool.poetry.dependencies]
18
- python = "^3.9"
19
- typer = "0.12.5"
20
- psycopg2-binary = "2.9.10"
21
- requests = "2.32.3"
22
- tenacity = "9.0.0"
23
- beautifulsoup4 = "4.12.3"
24
- pre-commit = "4.0.1"
25
- pendulum = "3.0.0"
26
- croniter = "3.0.4"
27
- dynaconf = "3.2.6"
28
- jinja2 = "3.1.5"
29
-
30
- boto3 = { version = "1.35.53", optional = true }
31
- unidecode = { version = "1.3.8", optional = true }
32
- lxml = { extras = ["html-clean"], version = "5.3.0", optional = true }
33
- tqdm = { version = "4.66.6", optional = true }
34
- pandas = { version = "2.2.3", optional = true }
35
- pyarrow = { version = "17.0.0", optional = true }
36
- pytest = { version = "8.3.3", optional = true }
37
- playwright = { version = "1.47.0", optional = true }
38
- playwright-stealth = { version = "1.0.6", optional = true }
39
- soda-core-postgres = { version = "3.4.1", optional = true }
40
- soda-core-mysql = { version = "3.4.4", optional = true }
41
- fake-useragent = { version = "1.5.1", optional = true }
42
- pydrive2 = { version = "1.20.0", optional = true }
43
- clickhouse-driver = { version = "0.2.9", optional = true }
44
- stem = { version = "1.8.2", optional = true }
45
- click = { version = "8.1.7", optional = true }
46
- rapidfuzz = { version = "3.10.1", optional = true }
47
- demjson3 = { version = "3.0.6", optional = true }
48
- geopy = { version = "2.4.1", optional = true }
49
- nodriver = { version = "0.38.post1", optional = true }
50
- undetected-chromedriver = { version = "3.5.5", optional = true }
51
- retry = { version = "0.9.2", optional = true }
52
- shapely = { version = "2.0.6", optional = true }
53
- geopandas = { version = "1.0.1", optional = true }
54
- chompjs = { version = "1.3.0", optional = true }
55
- pillow = { version = "11.0.0", optional = true }
56
- duckduckgo-search = { version = "6.2.11b1", optional = true }
57
- pysocks = { version = "1.7.1", optional = true }
58
- xmltodict = { version = "0.14.2", optional = true }
59
- pymupdf = { version = "1.24.13", optional = true }
60
- html2text = { version = "2024.2.26", optional = true }
61
- pyspark = { version = "3.5.3", optional = true }
62
- pandera = { version = "0.20.4", optional = true }
63
- json5 = { version = "0.9.25", optional = true }
64
- geoalchemy2 = { version = "0.15.2", optional = true }
65
- datetime = { version = "5.5", optional = true }
66
- azure-storage-blob = { version = "12.23.1", optional = true }
67
- google-api-python-client = { version = "2.151.0", optional = true }
68
- google-auth-httplib2 = { version = "0.2.0", optional = true }
69
- google-auth-oauthlib = { version = "1.2.1", optional = true }
70
- dnspython = { version = "2.7.0", optional = true }
71
- openpyxl = { version = "3.1.5", optional = true }
72
- httpx = { extras = ["http2"], version = "0.28.1", optional = true }
73
- SQLAlchemy = { version = "2.0.36", optional = true }
74
-
75
- [tool.poetry.extras]
76
- boto3 = ["boto3"]
77
- unidecode = ["unidecode"]
78
- lxml = ["lxml"]
79
- tqdm = ["tqdm"]
80
- pandas = ["pandas"]
81
- pyarrow = ["pyarrow"]
82
- pytest = ["pytest"]
83
- playwright = ["playwright"]
84
- playwright-stealth = ["playwright-stealth"]
85
- soda-core-postgres = ["soda-core-postgres"]
86
- soda-core-mysql = ["soda-core-mysql"]
87
- fake-useragent = ["fake-useragent"]
88
- pydrive2 = ["pydrive2"]
89
- clickhouse-driver = ["clickhouse-driver"]
90
- stem = ["stem"]
91
- click = ["click"]
92
- rapidfuzz = ["rapidfuzz"]
93
- demjson3 = ["demjson3"]
94
- geopy = ["geopy"]
95
- nodriver = ["nodriver"]
96
- undetected-chromedriver = ["undetected-chromedriver"]
97
- retry = ["retry"]
98
- shapely = ["shapely"]
99
- geopandas = ["geopandas"]
100
- chompjs = ["chompjs"]
101
- pillow = ["pillow"]
102
- duckduckgo-search = ["duckduckgo-search"]
103
- pysocks = ["pysocks"]
104
- xmltodict = ["xmltodict"]
105
- pymupdf = ["pymupdf"]
106
- html2text = ["html2text"]
107
- pyspark = ["pyspark"]
108
- pandera = ["pandera"]
109
- json5 = ["json5"]
110
- geoalchemy2 = ["geoalchemy2"]
111
- datetime = ["datetime"]
112
- azure-storage-blob = ["azure-storage-blob"]
113
- google-api-python-client = ["google-api-python-client"]
114
- google-auth-httplib2 = ["google-auth-httplib2"]
115
- google-auth-oauthlib = ["google-auth-oauthlib"]
116
- dnspython = ["dnspython"]
117
- openpyxl = ["openpyxl"]
118
- httpx = ["httpx"]
119
-
120
- # Interface groups
121
- aws = ["boto3"]
122
- drive = ["pydrive2"]
123
- peerdb = ["boto3", "clickhouse-driver"]
124
- proxy = ["stem"]
125
- alchemy = ["SQLAlchemy"]
126
-
127
-
128
- [build-system]
129
- requires = ["poetry-core>=1.0.0"]
130
- build-backend = "poetry.core.masonry.api"
File without changes
File without changes