datamarket 0.9.2__tar.gz → 0.9.4__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of datamarket might be problematic. Click here for more details.
- datamarket-0.9.4/PKG-INFO +144 -0
- datamarket-0.9.4/pyproject.toml +129 -0
- {datamarket-0.9.2 → datamarket-0.9.4}/src/datamarket/interfaces/aws.py +9 -7
- {datamarket-0.9.2 → datamarket-0.9.4}/src/datamarket/interfaces/ftp.py +3 -1
- {datamarket-0.9.2 → datamarket-0.9.4}/src/datamarket/interfaces/peerdb.py +8 -34
- {datamarket-0.9.2 → datamarket-0.9.4}/src/datamarket/utils/main.py +16 -7
- datamarket-0.9.2/PKG-INFO +0 -149
- datamarket-0.9.2/pyproject.toml +0 -130
- {datamarket-0.9.2 → datamarket-0.9.4}/LICENSE +0 -0
- {datamarket-0.9.2 → datamarket-0.9.4}/README.md +0 -0
- {datamarket-0.9.2 → datamarket-0.9.4}/src/datamarket/__init__.py +0 -0
- {datamarket-0.9.2 → datamarket-0.9.4}/src/datamarket/interfaces/__init__.py +0 -0
- {datamarket-0.9.2 → datamarket-0.9.4}/src/datamarket/interfaces/alchemy.py +0 -0
- {datamarket-0.9.2 → datamarket-0.9.4}/src/datamarket/interfaces/drive.py +0 -0
- {datamarket-0.9.2 → datamarket-0.9.4}/src/datamarket/interfaces/nominatim.py +0 -0
- {datamarket-0.9.2 → datamarket-0.9.4}/src/datamarket/interfaces/proxy.py +0 -0
- {datamarket-0.9.2 → datamarket-0.9.4}/src/datamarket/interfaces/tinybird.py +0 -0
- {datamarket-0.9.2 → datamarket-0.9.4}/src/datamarket/params/__init__.py +0 -0
- {datamarket-0.9.2 → datamarket-0.9.4}/src/datamarket/params/nominatim.py +0 -0
- {datamarket-0.9.2 → datamarket-0.9.4}/src/datamarket/utils/__init__.py +0 -0
- {datamarket-0.9.2 → datamarket-0.9.4}/src/datamarket/utils/airflow.py +0 -0
- {datamarket-0.9.2 → datamarket-0.9.4}/src/datamarket/utils/alchemy.py +0 -0
- {datamarket-0.9.2 → datamarket-0.9.4}/src/datamarket/utils/selenium.py +0 -0
- {datamarket-0.9.2 → datamarket-0.9.4}/src/datamarket/utils/soda.py +0 -0
- {datamarket-0.9.2 → datamarket-0.9.4}/src/datamarket/utils/typer.py +0 -0
|
@@ -0,0 +1,144 @@
|
|
|
1
|
+
Metadata-Version: 2.3
|
|
2
|
+
Name: datamarket
|
|
3
|
+
Version: 0.9.4
|
|
4
|
+
Summary: Utilities that integrate advanced scraping knowledge into just one library.
|
|
5
|
+
License: GPL-3.0-or-later
|
|
6
|
+
Author: DataMarket
|
|
7
|
+
Author-email: techsupport@datamarket.es
|
|
8
|
+
Requires-Python: >=3.12,<4.0
|
|
9
|
+
Classifier: License :: OSI Approved :: GNU General Public License v3 (GPLv3)
|
|
10
|
+
Classifier: License :: OSI Approved :: GNU General Public License v3 or later (GPLv3+)
|
|
11
|
+
Classifier: Operating System :: OS Independent
|
|
12
|
+
Classifier: Programming Language :: Python :: 3
|
|
13
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
14
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
15
|
+
Provides-Extra: alchemy
|
|
16
|
+
Provides-Extra: aws
|
|
17
|
+
Provides-Extra: azure-storage-blob
|
|
18
|
+
Provides-Extra: boto3
|
|
19
|
+
Provides-Extra: chompjs
|
|
20
|
+
Provides-Extra: click
|
|
21
|
+
Provides-Extra: clickhouse-driver
|
|
22
|
+
Provides-Extra: datetime
|
|
23
|
+
Provides-Extra: demjson3
|
|
24
|
+
Provides-Extra: dnspython
|
|
25
|
+
Provides-Extra: drive
|
|
26
|
+
Provides-Extra: duckduckgo-search
|
|
27
|
+
Provides-Extra: fake-useragent
|
|
28
|
+
Provides-Extra: geoalchemy2
|
|
29
|
+
Provides-Extra: geopandas
|
|
30
|
+
Provides-Extra: geopy
|
|
31
|
+
Provides-Extra: google-api-python-client
|
|
32
|
+
Provides-Extra: google-auth-httplib2
|
|
33
|
+
Provides-Extra: google-auth-oauthlib
|
|
34
|
+
Provides-Extra: html2text
|
|
35
|
+
Provides-Extra: httpx
|
|
36
|
+
Provides-Extra: json5
|
|
37
|
+
Provides-Extra: lxml
|
|
38
|
+
Provides-Extra: nodriver
|
|
39
|
+
Provides-Extra: openpyxl
|
|
40
|
+
Provides-Extra: pandas
|
|
41
|
+
Provides-Extra: pandera
|
|
42
|
+
Provides-Extra: peerdb
|
|
43
|
+
Provides-Extra: pillow
|
|
44
|
+
Provides-Extra: playwright
|
|
45
|
+
Provides-Extra: playwright-stealth
|
|
46
|
+
Provides-Extra: proxy
|
|
47
|
+
Provides-Extra: pyarrow
|
|
48
|
+
Provides-Extra: pydrive2
|
|
49
|
+
Provides-Extra: pymupdf
|
|
50
|
+
Provides-Extra: pysocks
|
|
51
|
+
Provides-Extra: pyspark
|
|
52
|
+
Provides-Extra: pytest
|
|
53
|
+
Provides-Extra: rapidfuzz
|
|
54
|
+
Provides-Extra: retry
|
|
55
|
+
Provides-Extra: shapely
|
|
56
|
+
Provides-Extra: soda-core-mysql
|
|
57
|
+
Provides-Extra: soda-core-postgres
|
|
58
|
+
Provides-Extra: stem
|
|
59
|
+
Provides-Extra: tqdm
|
|
60
|
+
Provides-Extra: undetected-chromedriver
|
|
61
|
+
Provides-Extra: unidecode
|
|
62
|
+
Provides-Extra: xmltodict
|
|
63
|
+
Requires-Dist: SQLAlchemy (>=2.0.0,<3.0.0) ; extra == "alchemy"
|
|
64
|
+
Requires-Dist: azure-storage-blob (>=12.0.0,<13.0.0) ; extra == "azure-storage-blob"
|
|
65
|
+
Requires-Dist: beautifulsoup4 (>=4.0.0,<5.0.0)
|
|
66
|
+
Requires-Dist: boto3 (>=1.0.0,<2.0.0) ; extra == "boto3" or extra == "aws" or extra == "peerdb"
|
|
67
|
+
Requires-Dist: chompjs (>=1.0.0,<2.0.0) ; extra == "chompjs"
|
|
68
|
+
Requires-Dist: click (>=8.0.0,<9.0.0) ; extra == "click"
|
|
69
|
+
Requires-Dist: clickhouse-driver (>=0.2.0,<0.3.0) ; extra == "clickhouse-driver" or extra == "peerdb"
|
|
70
|
+
Requires-Dist: croniter (>=3.0.0,<4.0.0)
|
|
71
|
+
Requires-Dist: datetime (>=5.0,<6.0) ; extra == "datetime"
|
|
72
|
+
Requires-Dist: demjson3 (>=3.0.0,<4.0.0) ; extra == "demjson3"
|
|
73
|
+
Requires-Dist: dnspython (>=2.0.0,<3.0.0) ; extra == "dnspython"
|
|
74
|
+
Requires-Dist: duckduckgo-search (>=7.0.0,<8.0.0) ; extra == "duckduckgo-search"
|
|
75
|
+
Requires-Dist: dynaconf (>=3.0.0,<4.0.0)
|
|
76
|
+
Requires-Dist: fake-useragent (>=2.0.0,<3.0.0) ; extra == "fake-useragent"
|
|
77
|
+
Requires-Dist: geoalchemy2 (>=0.17.0,<0.18.0) ; extra == "geoalchemy2"
|
|
78
|
+
Requires-Dist: geopandas (>=1.0.0,<2.0.0) ; extra == "geopandas"
|
|
79
|
+
Requires-Dist: geopy (>=2.0.0,<3.0.0) ; extra == "geopy"
|
|
80
|
+
Requires-Dist: google-api-python-client (>=2.0.0,<3.0.0) ; extra == "google-api-python-client"
|
|
81
|
+
Requires-Dist: google-auth-httplib2 (>=0.2.0,<0.3.0) ; extra == "google-auth-httplib2"
|
|
82
|
+
Requires-Dist: google-auth-oauthlib (>=1.0.0,<2.0.0) ; extra == "google-auth-oauthlib"
|
|
83
|
+
Requires-Dist: html2text (>=2024.0.0,<2025.0.0) ; extra == "html2text"
|
|
84
|
+
Requires-Dist: httpx[http2] (>=0.28.0,<0.29.0) ; extra == "httpx"
|
|
85
|
+
Requires-Dist: jinja2 (>=3.0.0,<4.0.0)
|
|
86
|
+
Requires-Dist: json5 (>=0.10.0,<0.11.0) ; extra == "json5"
|
|
87
|
+
Requires-Dist: lxml[html-clean] (>=5.0.0,<6.0.0) ; extra == "lxml"
|
|
88
|
+
Requires-Dist: nodriver (==0.38.post1) ; extra == "nodriver"
|
|
89
|
+
Requires-Dist: openpyxl (>=3.0.0,<4.0.0) ; extra == "openpyxl"
|
|
90
|
+
Requires-Dist: pandas (>=2.0.0,<3.0.0) ; extra == "pandas"
|
|
91
|
+
Requires-Dist: pandera (>=0.22.0,<0.23.0) ; extra == "pandera"
|
|
92
|
+
Requires-Dist: pendulum (>=3.0.0,<4.0.0)
|
|
93
|
+
Requires-Dist: pillow (>=11.0.0,<12.0.0) ; extra == "pillow"
|
|
94
|
+
Requires-Dist: playwright (==1.47.0) ; extra == "playwright"
|
|
95
|
+
Requires-Dist: pre-commit (>=4.0.0,<5.0.0)
|
|
96
|
+
Requires-Dist: psycopg2-binary (>=2.0.0,<3.0.0)
|
|
97
|
+
Requires-Dist: pyarrow (>=19.0.0,<20.0.0) ; extra == "pyarrow"
|
|
98
|
+
Requires-Dist: pydrive2 (>=1.0.0,<2.0.0) ; extra == "pydrive2" or extra == "drive"
|
|
99
|
+
Requires-Dist: pymupdf (>=1.0.0,<2.0.0) ; extra == "pymupdf"
|
|
100
|
+
Requires-Dist: pysocks (>=1.0.0,<2.0.0) ; extra == "pysocks"
|
|
101
|
+
Requires-Dist: pyspark (>=3.0.0,<4.0.0) ; extra == "pyspark"
|
|
102
|
+
Requires-Dist: pytest (>=8.0.0,<9.0.0) ; extra == "pytest"
|
|
103
|
+
Requires-Dist: rapidfuzz (>=3.0.0,<4.0.0) ; extra == "rapidfuzz"
|
|
104
|
+
Requires-Dist: requests (>=2.0.0,<3.0.0)
|
|
105
|
+
Requires-Dist: retry (>=0.9.0,<0.10.0) ; extra == "retry"
|
|
106
|
+
Requires-Dist: shapely (>=2.0.0,<3.0.0) ; extra == "shapely"
|
|
107
|
+
Requires-Dist: soda-core-mysql (>=3.0.0,<4.0.0) ; extra == "soda-core-mysql"
|
|
108
|
+
Requires-Dist: soda-core-postgres (>=3.0.0,<4.0.0) ; extra == "soda-core-postgres"
|
|
109
|
+
Requires-Dist: stem (>=1.0.0,<2.0.0) ; extra == "stem" or extra == "proxy"
|
|
110
|
+
Requires-Dist: tenacity (>=9.0.0,<10.0.0)
|
|
111
|
+
Requires-Dist: tqdm (>=4.0.0,<5.0.0) ; extra == "tqdm"
|
|
112
|
+
Requires-Dist: typer (>=0.15.0,<0.16.0)
|
|
113
|
+
Requires-Dist: unidecode (>=1.0.0,<2.0.0) ; extra == "unidecode"
|
|
114
|
+
Requires-Dist: xmltodict (>=0.14.0,<0.15.0) ; extra == "xmltodict"
|
|
115
|
+
Project-URL: Documentation, https://github.com/Data-Market/datamarket
|
|
116
|
+
Project-URL: Homepage, https://datamarket.es
|
|
117
|
+
Project-URL: Repository, https://github.com/Data-Market/datamarket
|
|
118
|
+
Description-Content-Type: text/markdown
|
|
119
|
+
|
|
120
|
+
# DataMarket scraping core
|
|
121
|
+
|
|
122
|
+
------------------------------------------------------
|
|
123
|
+
[![Code style: black](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/psf/black)
|
|
124
|
+
|
|
125
|
+
|
|
126
|
+
Utilities that integrate advanced scraping knowledge into just one library.
|
|
127
|
+
|
|
128
|
+
## Installation
|
|
129
|
+
|
|
130
|
+
To install this library in your Python environment:
|
|
131
|
+
|
|
132
|
+
`pip install datamarket`
|
|
133
|
+
|
|
134
|
+
## Documentation
|
|
135
|
+
|
|
136
|
+
This library has built functionalities for the following topics:
|
|
137
|
+
|
|
138
|
+
- **Databases**: through SQLAlchemy, it lets you insert records and run queries against any database.
|
|
139
|
+
- **Proxies**: wide range of functions to perform HTTP requests through custom proxies or the Tor network.
|
|
140
|
+
- **Tinybird**: a Python client for this popular API.
|
|
141
|
+
- **Drive**: functions to upload, delete or authenticate to Google Drive.
|
|
142
|
+
- **FTP**: functions to upload, delete or authenticate to an FTP, SFTP or FTPS server.
|
|
143
|
+
- **Selenium**: wrapper for the main Selenium functions.
|
|
144
|
+
|
|
@@ -0,0 +1,129 @@
|
|
|
1
|
+
[tool.poetry]
|
|
2
|
+
name = "datamarket"
|
|
3
|
+
version = "0.9.4"
|
|
4
|
+
description = "Utilities that integrate advanced scraping knowledge into just one library."
|
|
5
|
+
authors = ["DataMarket <techsupport@datamarket.es>"]
|
|
6
|
+
license = "GPL-3.0-or-later"
|
|
7
|
+
readme = "README.md"
|
|
8
|
+
homepage = "https://datamarket.es"
|
|
9
|
+
repository = "https://github.com/Data-Market/datamarket"
|
|
10
|
+
documentation = "https://github.com/Data-Market/datamarket"
|
|
11
|
+
classifiers = [
|
|
12
|
+
"Programming Language :: Python :: 3",
|
|
13
|
+
"License :: OSI Approved :: GNU General Public License v3 (GPLv3)",
|
|
14
|
+
"Operating System :: OS Independent",
|
|
15
|
+
]
|
|
16
|
+
|
|
17
|
+
[tool.poetry.dependencies]
|
|
18
|
+
python = "^3.12"
|
|
19
|
+
typer = "~0.15.0"
|
|
20
|
+
psycopg2-binary = "^2.0.0"
|
|
21
|
+
requests = "^2.0.0"
|
|
22
|
+
tenacity = "^9.0.0"
|
|
23
|
+
beautifulsoup4 = "^4.0.0"
|
|
24
|
+
pre-commit = "^4.0.0"
|
|
25
|
+
pendulum = "^3.0.0"
|
|
26
|
+
croniter = "^3.0.0"
|
|
27
|
+
dynaconf = "^3.0.0"
|
|
28
|
+
jinja2 = "^3.0.0"
|
|
29
|
+
|
|
30
|
+
boto3 = { version = "^1.0.0", optional = true }
|
|
31
|
+
unidecode = { version = "^1.0.0", optional = true }
|
|
32
|
+
lxml = { extras = ["html-clean"], version = "^5.0.0", optional = true }
|
|
33
|
+
tqdm = { version = "^4.0.0", optional = true }
|
|
34
|
+
pandas = { version = "^2.0.0", optional = true }
|
|
35
|
+
pyarrow = { version = "^19.0.0", optional = true }
|
|
36
|
+
pytest = { version = "^8.0.0", optional = true }
|
|
37
|
+
playwright = { version = "1.47.0", optional = true }
|
|
38
|
+
tf-playwright-stealth = { version = "^1.0.0", optional = true }
|
|
39
|
+
soda-core-postgres = { version = "^3.0.0", optional = true }
|
|
40
|
+
soda-core-mysql = { version = "^3.0.0", optional = true }
|
|
41
|
+
fake-useragent = { version = "^2.0.0", optional = true }
|
|
42
|
+
pydrive2 = { version = "^1.0.0", optional = true }
|
|
43
|
+
clickhouse-driver = { version = "~0.2.0", optional = true }
|
|
44
|
+
stem = { version = "^1.0.0", optional = true }
|
|
45
|
+
click = { version = "^8.0.0", optional = true }
|
|
46
|
+
rapidfuzz = { version = "^3.0.0", optional = true }
|
|
47
|
+
demjson3 = { version = "^3.0.0", optional = true }
|
|
48
|
+
geopy = { version = "^2.0.0", optional = true }
|
|
49
|
+
nodriver = { version = "0.38.post1", optional = true }
|
|
50
|
+
retry = { version = "~0.9.0", optional = true }
|
|
51
|
+
shapely = { version = "^2.0.0", optional = true }
|
|
52
|
+
geopandas = { version = "^1.0.0", optional = true }
|
|
53
|
+
chompjs = { version = "^1.0.0", optional = true }
|
|
54
|
+
pillow = { version = "^11.0.0", optional = true }
|
|
55
|
+
duckduckgo-search = { version = "^7.0.0", optional = true }
|
|
56
|
+
pysocks = { version = "^1.0.0", optional = true }
|
|
57
|
+
xmltodict = { version = "~0.14.0", optional = true }
|
|
58
|
+
pymupdf = { version = "^1.0.0", optional = true }
|
|
59
|
+
html2text = { version = "^2024.0.0", optional = true }
|
|
60
|
+
pyspark = { version = "^3.0.0", optional = true }
|
|
61
|
+
pandera = { version = "~0.22.0", optional = true }
|
|
62
|
+
json5 = { version = "~0.10.0", optional = true }
|
|
63
|
+
geoalchemy2 = { version = "~0.17.0", optional = true }
|
|
64
|
+
datetime = { version = "^5.0", optional = true }
|
|
65
|
+
azure-storage-blob = { version = "^12.0.0", optional = true }
|
|
66
|
+
google-api-python-client = { version = "^2.0.0", optional = true }
|
|
67
|
+
google-auth-httplib2 = { version = "~0.2.0", optional = true }
|
|
68
|
+
google-auth-oauthlib = { version = "^1.0.0", optional = true }
|
|
69
|
+
dnspython = { version = "^2.0.0", optional = true }
|
|
70
|
+
openpyxl = { version = "^3.0.0", optional = true }
|
|
71
|
+
httpx = { extras = ["http2"], version = "~0.28.0", optional = true }
|
|
72
|
+
SQLAlchemy = { version = "^2.0.0", optional = true }
|
|
73
|
+
|
|
74
|
+
[tool.poetry.extras]
|
|
75
|
+
boto3 = ["boto3"]
|
|
76
|
+
unidecode = ["unidecode"]
|
|
77
|
+
lxml = ["lxml"]
|
|
78
|
+
tqdm = ["tqdm"]
|
|
79
|
+
pandas = ["pandas"]
|
|
80
|
+
pyarrow = ["pyarrow"]
|
|
81
|
+
pytest = ["pytest"]
|
|
82
|
+
playwright = ["playwright"]
|
|
83
|
+
playwright-stealth = ["playwright-stealth"]
|
|
84
|
+
soda-core-postgres = ["soda-core-postgres"]
|
|
85
|
+
soda-core-mysql = ["soda-core-mysql"]
|
|
86
|
+
fake-useragent = ["fake-useragent"]
|
|
87
|
+
pydrive2 = ["pydrive2"]
|
|
88
|
+
clickhouse-driver = ["clickhouse-driver"]
|
|
89
|
+
stem = ["stem"]
|
|
90
|
+
click = ["click"]
|
|
91
|
+
rapidfuzz = ["rapidfuzz"]
|
|
92
|
+
demjson3 = ["demjson3"]
|
|
93
|
+
geopy = ["geopy"]
|
|
94
|
+
nodriver = ["nodriver"]
|
|
95
|
+
undetected-chromedriver = ["undetected-chromedriver"]
|
|
96
|
+
retry = ["retry"]
|
|
97
|
+
shapely = ["shapely"]
|
|
98
|
+
geopandas = ["geopandas"]
|
|
99
|
+
chompjs = ["chompjs"]
|
|
100
|
+
pillow = ["pillow"]
|
|
101
|
+
duckduckgo-search = ["duckduckgo-search"]
|
|
102
|
+
pysocks = ["pysocks"]
|
|
103
|
+
xmltodict = ["xmltodict"]
|
|
104
|
+
pymupdf = ["pymupdf"]
|
|
105
|
+
html2text = ["html2text"]
|
|
106
|
+
pyspark = ["pyspark"]
|
|
107
|
+
pandera = ["pandera"]
|
|
108
|
+
json5 = ["json5"]
|
|
109
|
+
geoalchemy2 = ["geoalchemy2"]
|
|
110
|
+
datetime = ["datetime"]
|
|
111
|
+
azure-storage-blob = ["azure-storage-blob"]
|
|
112
|
+
google-api-python-client = ["google-api-python-client"]
|
|
113
|
+
google-auth-httplib2 = ["google-auth-httplib2"]
|
|
114
|
+
google-auth-oauthlib = ["google-auth-oauthlib"]
|
|
115
|
+
dnspython = ["dnspython"]
|
|
116
|
+
openpyxl = ["openpyxl"]
|
|
117
|
+
httpx = ["httpx"]
|
|
118
|
+
|
|
119
|
+
# Interface groups
|
|
120
|
+
aws = ["boto3"]
|
|
121
|
+
drive = ["pydrive2"]
|
|
122
|
+
peerdb = ["boto3", "clickhouse-driver"]
|
|
123
|
+
proxy = ["stem"]
|
|
124
|
+
alchemy = ["SQLAlchemy"]
|
|
125
|
+
|
|
126
|
+
|
|
127
|
+
[build-system]
|
|
128
|
+
requires = ["poetry-core>=1.0.0"]
|
|
129
|
+
build-backend = "poetry.core.masonry.api"
|
|
@@ -5,6 +5,8 @@ import io
|
|
|
5
5
|
import logging
|
|
6
6
|
import boto3
|
|
7
7
|
|
|
8
|
+
from ..utils.main import Config
|
|
9
|
+
|
|
8
10
|
########################################################################################################################
|
|
9
11
|
# CLASSES
|
|
10
12
|
|
|
@@ -12,7 +14,7 @@ logger = logging.getLogger(__name__)
|
|
|
12
14
|
|
|
13
15
|
|
|
14
16
|
class AWSInterface:
|
|
15
|
-
def __init__(self, config):
|
|
17
|
+
def __init__(self, config: Config) -> None:
|
|
16
18
|
self.profiles = []
|
|
17
19
|
self.config = config
|
|
18
20
|
|
|
@@ -31,13 +33,13 @@ class AWSInterface:
|
|
|
31
33
|
self.current_profile = self.profiles[0] if self.profiles else None
|
|
32
34
|
self._update_resources()
|
|
33
35
|
|
|
34
|
-
def _update_resources(self):
|
|
36
|
+
def _update_resources(self) -> None:
|
|
35
37
|
if self.current_profile:
|
|
36
38
|
self.s3 = self.current_profile["session"].resource("s3")
|
|
37
39
|
self.s3_client = self.s3.meta.client
|
|
38
40
|
self.bucket = self.current_profile["buckets"][0]
|
|
39
41
|
|
|
40
|
-
def switch_profile(self, profile_name: str):
|
|
42
|
+
def switch_profile(self, profile_name: str) -> None:
|
|
41
43
|
for profile in self.profiles:
|
|
42
44
|
if profile["profile"] == profile_name:
|
|
43
45
|
self.current_profile = profile
|
|
@@ -45,7 +47,7 @@ class AWSInterface:
|
|
|
45
47
|
return
|
|
46
48
|
logger.warning(f"Profile {profile_name} not found")
|
|
47
49
|
|
|
48
|
-
def switch_bucket(self, bucket: str):
|
|
50
|
+
def switch_bucket(self, bucket: str) -> None:
|
|
49
51
|
if bucket not in self.current_profile["buckets"]:
|
|
50
52
|
logger.warning(
|
|
51
53
|
f"Bucket {bucket} not found in profile {self.current_profile['profile']}"
|
|
@@ -54,14 +56,14 @@ class AWSInterface:
|
|
|
54
56
|
|
|
55
57
|
self.bucket = bucket
|
|
56
58
|
|
|
57
|
-
def get_file(self, s3_path: str):
|
|
59
|
+
def get_file(self, s3_path: str) -> None:
|
|
58
60
|
try:
|
|
59
61
|
return self.s3.Object(self.bucket, s3_path).get()
|
|
60
62
|
except self.s3_client.exceptions.NoSuchKey:
|
|
61
63
|
logger.info(f"{s3_path} does not exist")
|
|
62
64
|
|
|
63
|
-
def read_file_as_bytes(self, s3_path: str):
|
|
65
|
+
def read_file_as_bytes(self, s3_path: str) -> io.BytesIO:
|
|
64
66
|
return io.BytesIO(self.get_file(s3_path)["Body"].read())
|
|
65
67
|
|
|
66
|
-
def upload_file(self, local_path: str, s3_path: str):
|
|
68
|
+
def upload_file(self, local_path: str, s3_path: str) -> None:
|
|
67
69
|
self.s3.Bucket(self.bucket).upload_file(local_path, s3_path)
|
|
@@ -5,6 +5,8 @@ import logging
|
|
|
5
5
|
from ftplib import FTP, FTP_TLS
|
|
6
6
|
from pathlib import Path
|
|
7
7
|
|
|
8
|
+
from ..utils.main import Config
|
|
9
|
+
|
|
8
10
|
########################################################################################################################
|
|
9
11
|
# CLASSES
|
|
10
12
|
|
|
@@ -12,7 +14,7 @@ logger = logging.getLogger(__name__)
|
|
|
12
14
|
|
|
13
15
|
|
|
14
16
|
class FTPInterface:
|
|
15
|
-
def __init__(self, config):
|
|
17
|
+
def __init__(self, config: Config):
|
|
16
18
|
if "ftp" in config:
|
|
17
19
|
self.config = config["ftp"]
|
|
18
20
|
|
|
@@ -216,52 +216,26 @@ class TransientS3:
|
|
|
216
216
|
self.config = section
|
|
217
217
|
self.bucket_name = self.config["bucket"]
|
|
218
218
|
self.session = boto3.Session(profile_name=self.config["profile"])
|
|
219
|
-
self.
|
|
219
|
+
self.s3_resource = self.session.resource("s3")
|
|
220
220
|
self.credentials = self.session.get_credentials()
|
|
221
221
|
self.access_key = self.credentials.access_key
|
|
222
222
|
self.secret_key = self.credentials.secret_key
|
|
223
223
|
self.region_name = self.session.region_name
|
|
224
|
-
self.endpoint_url = self.
|
|
224
|
+
self.endpoint_url = self.s3_resource.meta.endpoint_url
|
|
225
225
|
else:
|
|
226
226
|
logger.warning("no peerdb.s3 section in config")
|
|
227
227
|
|
|
228
228
|
def delete_paths_with_schema(self, schema_name):
|
|
229
229
|
logger.info(f"Deleting paths containing '{schema_name}' from S3")
|
|
230
230
|
|
|
231
|
-
|
|
232
|
-
pages = paginator.paginate(Bucket=self.bucket_name, Delimiter="/")
|
|
231
|
+
bucket = self.s3_resource.Bucket(self.bucket_name)
|
|
233
232
|
|
|
234
|
-
for
|
|
235
|
-
|
|
236
|
-
|
|
237
|
-
folder = prefix["Prefix"]
|
|
238
|
-
if schema_name in folder:
|
|
239
|
-
self._delete_folder_contents(folder)
|
|
233
|
+
for prefix in [schema_name, f"clone_{schema_name}"]:
|
|
234
|
+
objects_to_delete = bucket.objects.filter(Prefix=prefix)
|
|
235
|
+
objects_to_delete.delete()
|
|
240
236
|
|
|
241
237
|
logger.info(f"Deleted paths containing '{schema_name}' from S3")
|
|
242
238
|
|
|
243
|
-
def _delete_folder_contents(self, folder):
|
|
244
|
-
logger.info(f"Deleting contents of folder: {folder}")
|
|
245
|
-
|
|
246
|
-
paginator = self.s3_client.get_paginator("list_objects_v2")
|
|
247
|
-
pages = paginator.paginate(Bucket=self.bucket_name, Prefix=folder)
|
|
248
|
-
|
|
249
|
-
delete_us = dict(Objects=[])
|
|
250
|
-
for page in pages:
|
|
251
|
-
if "Contents" in page:
|
|
252
|
-
for obj in page["Contents"]:
|
|
253
|
-
delete_us["Objects"].append(dict(Key=obj["Key"]))
|
|
254
|
-
|
|
255
|
-
# AWS limits to deleting 1000 objects at a time
|
|
256
|
-
if len(delete_us["Objects"]) >= 1000:
|
|
257
|
-
self.s3_client.delete_objects(Bucket=self.bucket_name, Delete=delete_us)
|
|
258
|
-
delete_us = dict(Objects=[])
|
|
259
|
-
|
|
260
|
-
if len(delete_us["Objects"]):
|
|
261
|
-
self.s3_client.delete_objects(Bucket=self.bucket_name, Delete=delete_us)
|
|
262
|
-
|
|
263
|
-
logger.info(f"Deleted contents of folder: {folder}")
|
|
264
|
-
|
|
265
239
|
|
|
266
240
|
class PeerDBInterface:
|
|
267
241
|
def __init__(self, config):
|
|
@@ -308,11 +282,11 @@ class PeerDBInterface:
|
|
|
308
282
|
if not self.docker_host_mapping or not host:
|
|
309
283
|
return host
|
|
310
284
|
|
|
311
|
-
if host in [
|
|
285
|
+
if host in ["localhost", "127.0.0.1"]:
|
|
312
286
|
logger.debug(f"Mapping host {host} to {self.docker_host_mapping} for Docker environment")
|
|
313
287
|
return self.docker_host_mapping
|
|
314
288
|
|
|
315
|
-
url_pattern = r
|
|
289
|
+
url_pattern = r"(localhost|127\.0\.0\.1)"
|
|
316
290
|
match = re.search(url_pattern, host)
|
|
317
291
|
if match:
|
|
318
292
|
original_host = match.group(1)
|
|
@@ -2,7 +2,6 @@
|
|
|
2
2
|
# IMPORTS
|
|
3
3
|
|
|
4
4
|
import asyncio
|
|
5
|
-
import configparser
|
|
6
5
|
import inspect
|
|
7
6
|
import logging
|
|
8
7
|
import random
|
|
@@ -16,6 +15,7 @@ from typing import Literal, Union
|
|
|
16
15
|
|
|
17
16
|
import pendulum
|
|
18
17
|
from croniter import croniter
|
|
18
|
+
from configparser import RawConfigParser
|
|
19
19
|
from dynaconf import Dynaconf, add_converter
|
|
20
20
|
|
|
21
21
|
########################################################################################################################
|
|
@@ -23,6 +23,8 @@ from dynaconf import Dynaconf, add_converter
|
|
|
23
23
|
|
|
24
24
|
logger = logging.getLogger(__name__)
|
|
25
25
|
|
|
26
|
+
Config = Union[RawConfigParser, Dynaconf]
|
|
27
|
+
|
|
26
28
|
|
|
27
29
|
def get_granular_date(
|
|
28
30
|
granularity: Union[Literal["monthly", "biweekly", "weekly", "daily"], str], tz: str = "Europe/Madrid"
|
|
@@ -64,16 +66,20 @@ def read_converter(path_str: str):
|
|
|
64
66
|
return f.read()
|
|
65
67
|
|
|
66
68
|
|
|
67
|
-
def get_config(config_file: Path, tz: str = "Europe/Madrid"):
|
|
69
|
+
def get_config(
|
|
70
|
+
config_file: Path, tz: str = "Europe/Madrid"
|
|
71
|
+
) -> Union[RawConfigParser, Dynaconf]:
|
|
68
72
|
if Path(config_file).suffix == ".ini":
|
|
69
73
|
logger.warning("Using legacy INI config reader. Please migrate to TOML")
|
|
70
|
-
cfg = configparser.RawConfigParser()
|
|
74
|
+
cfg = RawConfigParser()
|
|
71
75
|
cfg.read(config_file)
|
|
72
76
|
return cfg
|
|
73
77
|
|
|
74
78
|
add_converter("read", read_converter)
|
|
75
79
|
|
|
76
80
|
dt_now = get_granular_date("now", tz)
|
|
81
|
+
dt_weekly = get_granular_date("weekly", tz)
|
|
82
|
+
dt_biweekly = get_granular_date("biweekly", tz)
|
|
77
83
|
|
|
78
84
|
config = Dynaconf(
|
|
79
85
|
environments=True,
|
|
@@ -84,14 +90,17 @@ def get_config(config_file: Path, tz: str = "Europe/Madrid"):
|
|
|
84
90
|
config.load_file(path=Path.home() / config_file.name)
|
|
85
91
|
|
|
86
92
|
config.vars = {
|
|
87
|
-
"now": dt_now.strftime("%Y-%m-%d %H:%M:%S"),
|
|
88
|
-
"today": dt_now.strftime("%Y-%m-%d"),
|
|
89
93
|
"year": dt_now.strftime("%Y"),
|
|
90
94
|
"month": dt_now.strftime("%m"),
|
|
91
95
|
"day": dt_now.strftime("%d"),
|
|
92
|
-
"
|
|
93
|
-
"today_stripped": dt_now.strftime("%Y%m%d"),
|
|
96
|
+
"now": dt_now.strftime("%Y-%m-%d %H:%M:%S"),
|
|
94
97
|
"now_stripped": dt_now.strftime("%Y%m%d%H%M%S"),
|
|
98
|
+
"today": dt_now.strftime("%Y-%m-%d"),
|
|
99
|
+
"today_stripped": dt_now.strftime("%Y%m%d"),
|
|
100
|
+
"weekly_date": dt_weekly.strftime("%Y-%m-%d"),
|
|
101
|
+
"weekly_date_stripped": dt_weekly.strftime("%Y%m%d"),
|
|
102
|
+
"biweekly_date": dt_biweekly.strftime("%Y-%m-%d"),
|
|
103
|
+
"biweekly_date_stripped": dt_biweekly.strftime("%Y%m%d"),
|
|
95
104
|
"dynaconf_merge": True,
|
|
96
105
|
}
|
|
97
106
|
|
datamarket-0.9.2/PKG-INFO
DELETED
|
@@ -1,149 +0,0 @@
|
|
|
1
|
-
Metadata-Version: 2.3
|
|
2
|
-
Name: datamarket
|
|
3
|
-
Version: 0.9.2
|
|
4
|
-
Summary: Utilities that integrate advanced scraping knowledge into just one library.
|
|
5
|
-
License: GPL-3.0-or-later
|
|
6
|
-
Author: DataMarket
|
|
7
|
-
Author-email: techsupport@datamarket.es
|
|
8
|
-
Requires-Python: >=3.9,<4.0
|
|
9
|
-
Classifier: License :: OSI Approved :: GNU General Public License v3 (GPLv3)
|
|
10
|
-
Classifier: License :: OSI Approved :: GNU General Public License v3 or later (GPLv3+)
|
|
11
|
-
Classifier: Operating System :: OS Independent
|
|
12
|
-
Classifier: Programming Language :: Python :: 3
|
|
13
|
-
Classifier: Programming Language :: Python :: 3.9
|
|
14
|
-
Classifier: Programming Language :: Python :: 3.10
|
|
15
|
-
Classifier: Programming Language :: Python :: 3.11
|
|
16
|
-
Classifier: Programming Language :: Python :: 3.12
|
|
17
|
-
Classifier: Programming Language :: Python :: 3.13
|
|
18
|
-
Provides-Extra: alchemy
|
|
19
|
-
Provides-Extra: aws
|
|
20
|
-
Provides-Extra: azure-storage-blob
|
|
21
|
-
Provides-Extra: boto3
|
|
22
|
-
Provides-Extra: chompjs
|
|
23
|
-
Provides-Extra: click
|
|
24
|
-
Provides-Extra: clickhouse-driver
|
|
25
|
-
Provides-Extra: datetime
|
|
26
|
-
Provides-Extra: demjson3
|
|
27
|
-
Provides-Extra: dnspython
|
|
28
|
-
Provides-Extra: drive
|
|
29
|
-
Provides-Extra: duckduckgo-search
|
|
30
|
-
Provides-Extra: fake-useragent
|
|
31
|
-
Provides-Extra: geoalchemy2
|
|
32
|
-
Provides-Extra: geopandas
|
|
33
|
-
Provides-Extra: geopy
|
|
34
|
-
Provides-Extra: google-api-python-client
|
|
35
|
-
Provides-Extra: google-auth-httplib2
|
|
36
|
-
Provides-Extra: google-auth-oauthlib
|
|
37
|
-
Provides-Extra: html2text
|
|
38
|
-
Provides-Extra: httpx
|
|
39
|
-
Provides-Extra: json5
|
|
40
|
-
Provides-Extra: lxml
|
|
41
|
-
Provides-Extra: nodriver
|
|
42
|
-
Provides-Extra: openpyxl
|
|
43
|
-
Provides-Extra: pandas
|
|
44
|
-
Provides-Extra: pandera
|
|
45
|
-
Provides-Extra: peerdb
|
|
46
|
-
Provides-Extra: pillow
|
|
47
|
-
Provides-Extra: playwright
|
|
48
|
-
Provides-Extra: playwright-stealth
|
|
49
|
-
Provides-Extra: proxy
|
|
50
|
-
Provides-Extra: pyarrow
|
|
51
|
-
Provides-Extra: pydrive2
|
|
52
|
-
Provides-Extra: pymupdf
|
|
53
|
-
Provides-Extra: pysocks
|
|
54
|
-
Provides-Extra: pyspark
|
|
55
|
-
Provides-Extra: pytest
|
|
56
|
-
Provides-Extra: rapidfuzz
|
|
57
|
-
Provides-Extra: retry
|
|
58
|
-
Provides-Extra: shapely
|
|
59
|
-
Provides-Extra: soda-core-mysql
|
|
60
|
-
Provides-Extra: soda-core-postgres
|
|
61
|
-
Provides-Extra: stem
|
|
62
|
-
Provides-Extra: tqdm
|
|
63
|
-
Provides-Extra: undetected-chromedriver
|
|
64
|
-
Provides-Extra: unidecode
|
|
65
|
-
Provides-Extra: xmltodict
|
|
66
|
-
Requires-Dist: SQLAlchemy (==2.0.36) ; extra == "alchemy"
|
|
67
|
-
Requires-Dist: azure-storage-blob (==12.23.1) ; extra == "azure-storage-blob"
|
|
68
|
-
Requires-Dist: beautifulsoup4 (==4.12.3)
|
|
69
|
-
Requires-Dist: boto3 (==1.35.53) ; extra == "boto3" or extra == "aws" or extra == "peerdb"
|
|
70
|
-
Requires-Dist: chompjs (==1.3.0) ; extra == "chompjs"
|
|
71
|
-
Requires-Dist: click (==8.1.7) ; extra == "click"
|
|
72
|
-
Requires-Dist: clickhouse-driver (==0.2.9) ; extra == "clickhouse-driver" or extra == "peerdb"
|
|
73
|
-
Requires-Dist: croniter (==3.0.4)
|
|
74
|
-
Requires-Dist: datetime (==5.5) ; extra == "datetime"
|
|
75
|
-
Requires-Dist: demjson3 (==3.0.6) ; extra == "demjson3"
|
|
76
|
-
Requires-Dist: dnspython (==2.7.0) ; extra == "dnspython"
|
|
77
|
-
Requires-Dist: duckduckgo-search (==6.2.11b1) ; extra == "duckduckgo-search"
|
|
78
|
-
Requires-Dist: dynaconf (==3.2.6)
|
|
79
|
-
Requires-Dist: fake-useragent (==1.5.1) ; extra == "fake-useragent"
|
|
80
|
-
Requires-Dist: geoalchemy2 (==0.15.2) ; extra == "geoalchemy2"
|
|
81
|
-
Requires-Dist: geopandas (==1.0.1) ; extra == "geopandas"
|
|
82
|
-
Requires-Dist: geopy (==2.4.1) ; extra == "geopy"
|
|
83
|
-
Requires-Dist: google-api-python-client (==2.151.0) ; extra == "google-api-python-client"
|
|
84
|
-
Requires-Dist: google-auth-httplib2 (==0.2.0) ; extra == "google-auth-httplib2"
|
|
85
|
-
Requires-Dist: google-auth-oauthlib (==1.2.1) ; extra == "google-auth-oauthlib"
|
|
86
|
-
Requires-Dist: html2text (==2024.2.26) ; extra == "html2text"
|
|
87
|
-
Requires-Dist: httpx[http2] (==0.28.1) ; extra == "httpx"
|
|
88
|
-
Requires-Dist: jinja2 (==3.1.5)
|
|
89
|
-
Requires-Dist: json5 (==0.9.25) ; extra == "json5"
|
|
90
|
-
Requires-Dist: lxml[html-clean] (==5.3.0) ; extra == "lxml"
|
|
91
|
-
Requires-Dist: nodriver (==0.38.post1) ; extra == "nodriver"
|
|
92
|
-
Requires-Dist: openpyxl (==3.1.5) ; extra == "openpyxl"
|
|
93
|
-
Requires-Dist: pandas (==2.2.3) ; extra == "pandas"
|
|
94
|
-
Requires-Dist: pandera (==0.20.4) ; extra == "pandera"
|
|
95
|
-
Requires-Dist: pendulum (==3.0.0)
|
|
96
|
-
Requires-Dist: pillow (==11.0.0) ; extra == "pillow"
|
|
97
|
-
Requires-Dist: playwright (==1.47.0) ; extra == "playwright"
|
|
98
|
-
Requires-Dist: playwright-stealth (==1.0.6) ; extra == "playwright-stealth"
|
|
99
|
-
Requires-Dist: pre-commit (==4.0.1)
|
|
100
|
-
Requires-Dist: psycopg2-binary (==2.9.10)
|
|
101
|
-
Requires-Dist: pyarrow (==17.0.0) ; extra == "pyarrow"
|
|
102
|
-
Requires-Dist: pydrive2 (==1.20.0) ; extra == "pydrive2" or extra == "drive"
|
|
103
|
-
Requires-Dist: pymupdf (==1.24.13) ; extra == "pymupdf"
|
|
104
|
-
Requires-Dist: pysocks (==1.7.1) ; extra == "pysocks"
|
|
105
|
-
Requires-Dist: pyspark (==3.5.3) ; extra == "pyspark"
|
|
106
|
-
Requires-Dist: pytest (==8.3.3) ; extra == "pytest"
|
|
107
|
-
Requires-Dist: rapidfuzz (==3.10.1) ; extra == "rapidfuzz"
|
|
108
|
-
Requires-Dist: requests (==2.32.3)
|
|
109
|
-
Requires-Dist: retry (==0.9.2) ; extra == "retry"
|
|
110
|
-
Requires-Dist: shapely (==2.0.6) ; extra == "shapely"
|
|
111
|
-
Requires-Dist: soda-core-mysql (==3.4.4) ; extra == "soda-core-mysql"
|
|
112
|
-
Requires-Dist: soda-core-postgres (==3.4.1) ; extra == "soda-core-postgres"
|
|
113
|
-
Requires-Dist: stem (==1.8.2) ; extra == "stem" or extra == "proxy"
|
|
114
|
-
Requires-Dist: tenacity (==9.0.0)
|
|
115
|
-
Requires-Dist: tqdm (==4.66.6) ; extra == "tqdm"
|
|
116
|
-
Requires-Dist: typer (==0.12.5)
|
|
117
|
-
Requires-Dist: undetected-chromedriver (==3.5.5) ; extra == "undetected-chromedriver"
|
|
118
|
-
Requires-Dist: unidecode (==1.3.8) ; extra == "unidecode"
|
|
119
|
-
Requires-Dist: xmltodict (==0.14.2) ; extra == "xmltodict"
|
|
120
|
-
Project-URL: Documentation, https://github.com/Data-Market/datamarket
|
|
121
|
-
Project-URL: Homepage, https://datamarket.es
|
|
122
|
-
Project-URL: Repository, https://github.com/Data-Market/datamarket
|
|
123
|
-
Description-Content-Type: text/markdown
|
|
124
|
-
|
|
125
|
-
# DataMarket scraping core
|
|
126
|
-
|
|
127
|
-
------------------------------------------------------
|
|
128
|
-
[![Code style: black](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/psf/black)
|
|
129
|
-
|
|
130
|
-
|
|
131
|
-
Utilities that integrate advanced scraping knowledge into just one library.
|
|
132
|
-
|
|
133
|
-
## Installation
|
|
134
|
-
|
|
135
|
-
To install this library in your Python environment:
|
|
136
|
-
|
|
137
|
-
`pip install datamarket`
|
|
138
|
-
|
|
139
|
-
## Documentation
|
|
140
|
-
|
|
141
|
-
This library has built functionalities for the following topics:
|
|
142
|
-
|
|
143
|
-
- **Databases**: through SQLAlchemy, it lets you insert records and run queries against any database.
|
|
144
|
-
- **Proxies**: wide range of functions to perform HTTP requests through custom proxies or the Tor network.
|
|
145
|
-
- **Tinybird**: a Python client for this popular API.
|
|
146
|
-
- **Drive**: functions to upload, delete or authenticate to Google Drive.
|
|
147
|
-
- **FTP**: functions to upload, delete or authenticate to an FTP, SFTP or FTPS server.
|
|
148
|
-
- **Selenium**: wrapper for the main Selenium functions.
|
|
149
|
-
|
datamarket-0.9.2/pyproject.toml
DELETED
|
@@ -1,130 +0,0 @@
|
|
|
1
|
-
[tool.poetry]
|
|
2
|
-
name = "datamarket"
|
|
3
|
-
version = "0.9.2"
|
|
4
|
-
description = "Utilities that integrate advanced scraping knowledge into just one library."
|
|
5
|
-
authors = ["DataMarket <techsupport@datamarket.es>"]
|
|
6
|
-
license = "GPL-3.0-or-later"
|
|
7
|
-
readme = "README.md"
|
|
8
|
-
homepage = "https://datamarket.es"
|
|
9
|
-
repository = "https://github.com/Data-Market/datamarket"
|
|
10
|
-
documentation = "https://github.com/Data-Market/datamarket"
|
|
11
|
-
classifiers = [
|
|
12
|
-
"Programming Language :: Python :: 3",
|
|
13
|
-
"License :: OSI Approved :: GNU General Public License v3 (GPLv3)",
|
|
14
|
-
"Operating System :: OS Independent",
|
|
15
|
-
]
|
|
16
|
-
|
|
17
|
-
[tool.poetry.dependencies]
|
|
18
|
-
python = "^3.9"
|
|
19
|
-
typer = "0.12.5"
|
|
20
|
-
psycopg2-binary = "2.9.10"
|
|
21
|
-
requests = "2.32.3"
|
|
22
|
-
tenacity = "9.0.0"
|
|
23
|
-
beautifulsoup4 = "4.12.3"
|
|
24
|
-
pre-commit = "4.0.1"
|
|
25
|
-
pendulum = "3.0.0"
|
|
26
|
-
croniter = "3.0.4"
|
|
27
|
-
dynaconf = "3.2.6"
|
|
28
|
-
jinja2 = "3.1.5"
|
|
29
|
-
|
|
30
|
-
boto3 = { version = "1.35.53", optional = true }
|
|
31
|
-
unidecode = { version = "1.3.8", optional = true }
|
|
32
|
-
lxml = { extras = ["html-clean"], version = "5.3.0", optional = true }
|
|
33
|
-
tqdm = { version = "4.66.6", optional = true }
|
|
34
|
-
pandas = { version = "2.2.3", optional = true }
|
|
35
|
-
pyarrow = { version = "17.0.0", optional = true }
|
|
36
|
-
pytest = { version = "8.3.3", optional = true }
|
|
37
|
-
playwright = { version = "1.47.0", optional = true }
|
|
38
|
-
playwright-stealth = { version = "1.0.6", optional = true }
|
|
39
|
-
soda-core-postgres = { version = "3.4.1", optional = true }
|
|
40
|
-
soda-core-mysql = { version = "3.4.4", optional = true }
|
|
41
|
-
fake-useragent = { version = "1.5.1", optional = true }
|
|
42
|
-
pydrive2 = { version = "1.20.0", optional = true }
|
|
43
|
-
clickhouse-driver = { version = "0.2.9", optional = true }
|
|
44
|
-
stem = { version = "1.8.2", optional = true }
|
|
45
|
-
click = { version = "8.1.7", optional = true }
|
|
46
|
-
rapidfuzz = { version = "3.10.1", optional = true }
|
|
47
|
-
demjson3 = { version = "3.0.6", optional = true }
|
|
48
|
-
geopy = { version = "2.4.1", optional = true }
|
|
49
|
-
nodriver = { version = "0.38.post1", optional = true }
|
|
50
|
-
undetected-chromedriver = { version = "3.5.5", optional = true }
|
|
51
|
-
retry = { version = "0.9.2", optional = true }
|
|
52
|
-
shapely = { version = "2.0.6", optional = true }
|
|
53
|
-
geopandas = { version = "1.0.1", optional = true }
|
|
54
|
-
chompjs = { version = "1.3.0", optional = true }
|
|
55
|
-
pillow = { version = "11.0.0", optional = true }
|
|
56
|
-
duckduckgo-search = { version = "6.2.11b1", optional = true }
|
|
57
|
-
pysocks = { version = "1.7.1", optional = true }
|
|
58
|
-
xmltodict = { version = "0.14.2", optional = true }
|
|
59
|
-
pymupdf = { version = "1.24.13", optional = true }
|
|
60
|
-
html2text = { version = "2024.2.26", optional = true }
|
|
61
|
-
pyspark = { version = "3.5.3", optional = true }
|
|
62
|
-
pandera = { version = "0.20.4", optional = true }
|
|
63
|
-
json5 = { version = "0.9.25", optional = true }
|
|
64
|
-
geoalchemy2 = { version = "0.15.2", optional = true }
|
|
65
|
-
datetime = { version = "5.5", optional = true }
|
|
66
|
-
azure-storage-blob = { version = "12.23.1", optional = true }
|
|
67
|
-
google-api-python-client = { version = "2.151.0", optional = true }
|
|
68
|
-
google-auth-httplib2 = { version = "0.2.0", optional = true }
|
|
69
|
-
google-auth-oauthlib = { version = "1.2.1", optional = true }
|
|
70
|
-
dnspython = { version = "2.7.0", optional = true }
|
|
71
|
-
openpyxl = { version = "3.1.5", optional = true }
|
|
72
|
-
httpx = { extras = ["http2"], version = "0.28.1", optional = true }
|
|
73
|
-
SQLAlchemy = { version = "2.0.36", optional = true }
|
|
74
|
-
|
|
75
|
-
[tool.poetry.extras]
|
|
76
|
-
boto3 = ["boto3"]
|
|
77
|
-
unidecode = ["unidecode"]
|
|
78
|
-
lxml = ["lxml"]
|
|
79
|
-
tqdm = ["tqdm"]
|
|
80
|
-
pandas = ["pandas"]
|
|
81
|
-
pyarrow = ["pyarrow"]
|
|
82
|
-
pytest = ["pytest"]
|
|
83
|
-
playwright = ["playwright"]
|
|
84
|
-
playwright-stealth = ["playwright-stealth"]
|
|
85
|
-
soda-core-postgres = ["soda-core-postgres"]
|
|
86
|
-
soda-core-mysql = ["soda-core-mysql"]
|
|
87
|
-
fake-useragent = ["fake-useragent"]
|
|
88
|
-
pydrive2 = ["pydrive2"]
|
|
89
|
-
clickhouse-driver = ["clickhouse-driver"]
|
|
90
|
-
stem = ["stem"]
|
|
91
|
-
click = ["click"]
|
|
92
|
-
rapidfuzz = ["rapidfuzz"]
|
|
93
|
-
demjson3 = ["demjson3"]
|
|
94
|
-
geopy = ["geopy"]
|
|
95
|
-
nodriver = ["nodriver"]
|
|
96
|
-
undetected-chromedriver = ["undetected-chromedriver"]
|
|
97
|
-
retry = ["retry"]
|
|
98
|
-
shapely = ["shapely"]
|
|
99
|
-
geopandas = ["geopandas"]
|
|
100
|
-
chompjs = ["chompjs"]
|
|
101
|
-
pillow = ["pillow"]
|
|
102
|
-
duckduckgo-search = ["duckduckgo-search"]
|
|
103
|
-
pysocks = ["pysocks"]
|
|
104
|
-
xmltodict = ["xmltodict"]
|
|
105
|
-
pymupdf = ["pymupdf"]
|
|
106
|
-
html2text = ["html2text"]
|
|
107
|
-
pyspark = ["pyspark"]
|
|
108
|
-
pandera = ["pandera"]
|
|
109
|
-
json5 = ["json5"]
|
|
110
|
-
geoalchemy2 = ["geoalchemy2"]
|
|
111
|
-
datetime = ["datetime"]
|
|
112
|
-
azure-storage-blob = ["azure-storage-blob"]
|
|
113
|
-
google-api-python-client = ["google-api-python-client"]
|
|
114
|
-
google-auth-httplib2 = ["google-auth-httplib2"]
|
|
115
|
-
google-auth-oauthlib = ["google-auth-oauthlib"]
|
|
116
|
-
dnspython = ["dnspython"]
|
|
117
|
-
openpyxl = ["openpyxl"]
|
|
118
|
-
httpx = ["httpx"]
|
|
119
|
-
|
|
120
|
-
# Interface groups
|
|
121
|
-
aws = ["boto3"]
|
|
122
|
-
drive = ["pydrive2"]
|
|
123
|
-
peerdb = ["boto3", "clickhouse-driver"]
|
|
124
|
-
proxy = ["stem"]
|
|
125
|
-
alchemy = ["SQLAlchemy"]
|
|
126
|
-
|
|
127
|
-
|
|
128
|
-
[build-system]
|
|
129
|
-
requires = ["poetry-core>=1.0.0"]
|
|
130
|
-
build-backend = "poetry.core.masonry.api"
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|