ingestr 0.13.39__py3-none-any.whl → 0.13.41__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of ingestr might be problematic. Click here for more details.
- ingestr/src/buildinfo.py +1 -1
- ingestr/src/elasticsearch/__init__.py +80 -0
- ingestr/src/factory.py +3 -0
- ingestr/src/sources.py +95 -1
- {ingestr-0.13.39.dist-info → ingestr-0.13.41.dist-info}/METADATA +8 -1
- {ingestr-0.13.39.dist-info → ingestr-0.13.41.dist-info}/RECORD +9 -8
- {ingestr-0.13.39.dist-info → ingestr-0.13.41.dist-info}/WHEEL +0 -0
- {ingestr-0.13.39.dist-info → ingestr-0.13.41.dist-info}/entry_points.txt +0 -0
- {ingestr-0.13.39.dist-info → ingestr-0.13.41.dist-info}/licenses/LICENSE.md +0 -0
ingestr/src/buildinfo.py
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
version = "v0.13.39"
|
|
1
|
+
version = "v0.13.41"
|
|
ingestr/src/elasticsearch/__init__.py
ADDED
|
@@ -0,0 +1,80 @@
|
|
|
1
|
+
from datetime import date, datetime
|
|
2
|
+
from typing import Any, Optional
|
|
3
|
+
|
|
4
|
+
import dlt
|
|
5
|
+
import pendulum
|
|
6
|
+
from dlt.common.time import ensure_pendulum_datetime
|
|
7
|
+
from pendulum import parse
|
|
8
|
+
|
|
9
|
+
from elasticsearch import Elasticsearch
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
@dlt.source
|
|
13
|
+
def elasticsearch_source(
|
|
14
|
+
connection_url: str,
|
|
15
|
+
index: str,
|
|
16
|
+
verify_certs: bool,
|
|
17
|
+
incremental: Optional[dlt.sources.incremental] = None,
|
|
18
|
+
):
|
|
19
|
+
client = Elasticsearch(connection_url, verify_certs=verify_certs)
|
|
20
|
+
|
|
21
|
+
@dlt.resource(
|
|
22
|
+
name=index, primary_key="id", write_disposition="merge", incremental=incremental
|
|
23
|
+
)
|
|
24
|
+
def get_documents(incremental=incremental):
|
|
25
|
+
body = {"query": {"match_all": {}}}
|
|
26
|
+
|
|
27
|
+
if incremental:
|
|
28
|
+
start_value = incremental.last_value
|
|
29
|
+
range_filter = {"gte": start_value}
|
|
30
|
+
if incremental.end_value is not None:
|
|
31
|
+
range_filter["lt"] = incremental.end_value
|
|
32
|
+
body = {"query": {"range": {incremental.cursor_path: range_filter}}}
|
|
33
|
+
|
|
34
|
+
page = client.search(index=index, scroll="5m", size=5, body=body)
|
|
35
|
+
|
|
36
|
+
sid = page["_scroll_id"]
|
|
37
|
+
hits = page["hits"]["hits"]
|
|
38
|
+
|
|
39
|
+
if not hits:
|
|
40
|
+
return
|
|
41
|
+
|
|
42
|
+
# fetching first page (via .search)
|
|
43
|
+
for doc in hits:
|
|
44
|
+
doc_data = {"id": doc["_id"], **doc["_source"]}
|
|
45
|
+
if incremental:
|
|
46
|
+
doc_data[incremental.cursor_path] = convert_elasticsearch_objs(
|
|
47
|
+
doc_data[incremental.cursor_path]
|
|
48
|
+
)
|
|
49
|
+
yield doc_data
|
|
50
|
+
|
|
51
|
+
while True:
|
|
52
|
+
# fetching page 2 and other pages (via .scroll)
|
|
53
|
+
page = client.scroll(scroll_id=sid, scroll="5m")
|
|
54
|
+
sid = page["_scroll_id"]
|
|
55
|
+
hits = page["hits"]["hits"]
|
|
56
|
+
if not hits:
|
|
57
|
+
break
|
|
58
|
+
for doc in hits:
|
|
59
|
+
doc_data = {"id": doc["_id"], **doc["_source"]}
|
|
60
|
+
if incremental:
|
|
61
|
+
doc_data[incremental.cursor_path] = convert_elasticsearch_objs(
|
|
62
|
+
doc_data[incremental.cursor_path]
|
|
63
|
+
)
|
|
64
|
+
yield doc_data
|
|
65
|
+
|
|
66
|
+
client.clear_scroll(scroll_id=sid)
|
|
67
|
+
|
|
68
|
+
return get_documents
|
|
69
|
+
|
|
70
|
+
|
|
71
|
+
def convert_elasticsearch_objs(value: Any) -> Any:
|
|
72
|
+
if isinstance(value, str):
|
|
73
|
+
parsed_date = parse(value, strict=False)
|
|
74
|
+
if parsed_date is not None:
|
|
75
|
+
if isinstance(
|
|
76
|
+
parsed_date,
|
|
77
|
+
(pendulum.DateTime, pendulum.Date, datetime, date, str, float, int),
|
|
78
|
+
):
|
|
79
|
+
return ensure_pendulum_datetime(parsed_date)
|
|
80
|
+
return value
|
ingestr/src/factory.py
CHANGED
|
@@ -28,6 +28,7 @@ from ingestr.src.sources import (
|
|
|
28
28
|
AsanaSource,
|
|
29
29
|
ChessSource,
|
|
30
30
|
DynamoDBSource,
|
|
31
|
+
ElasticsearchSource,
|
|
31
32
|
FacebookAdsSource,
|
|
32
33
|
FrankfurterSource,
|
|
33
34
|
FreshdeskSource,
|
|
@@ -78,6 +79,7 @@ SQL_SOURCE_SCHEMES = [
|
|
|
78
79
|
"clickhouse",
|
|
79
80
|
"databricks",
|
|
80
81
|
"db2",
|
|
82
|
+
"spanner",
|
|
81
83
|
]
|
|
82
84
|
|
|
83
85
|
|
|
@@ -153,6 +155,7 @@ class SourceDestinationFactory:
|
|
|
153
155
|
"frankfurter": FrankfurterSource,
|
|
154
156
|
"freshdesk": FreshdeskSource,
|
|
155
157
|
"phantombuster": PhantombusterSource,
|
|
158
|
+
"elasticsearch": ElasticsearchSource,
|
|
156
159
|
}
|
|
157
160
|
destinations: Dict[str, Type[DestinationProtocol]] = {
|
|
158
161
|
"bigquery": BigQueryDestination,
|
ingestr/src/sources.py
CHANGED
|
@@ -52,7 +52,10 @@ class SqlSource:
|
|
|
52
52
|
def dlt_source(self, uri: str, table: str, **kwargs):
|
|
53
53
|
table_fields = TableDefinition(dataset="custom", table="custom")
|
|
54
54
|
if not table.startswith("query:"):
|
|
55
|
-
|
|
55
|
+
if uri.startswith("spanner://"):
|
|
56
|
+
table_fields = TableDefinition(dataset="", table=table)
|
|
57
|
+
else:
|
|
58
|
+
table_fields = table_string_to_dataclass(table)
|
|
56
59
|
|
|
57
60
|
incremental = None
|
|
58
61
|
if kwargs.get("incremental_key"):
|
|
@@ -113,6 +116,45 @@ class SqlSource:
|
|
|
113
116
|
if uri.startswith("db2://"):
|
|
114
117
|
uri = uri.replace("db2://", "db2+ibm_db://")
|
|
115
118
|
|
|
119
|
+
if uri.startswith("spanner://"):
|
|
120
|
+
parsed_uri = urlparse(uri)
|
|
121
|
+
query_params = parse_qs(parsed_uri.query)
|
|
122
|
+
|
|
123
|
+
project_id_param = query_params.get("project_id")
|
|
124
|
+
instance_id_param = query_params.get("instance_id")
|
|
125
|
+
database_param = query_params.get("database")
|
|
126
|
+
|
|
127
|
+
cred_path = query_params.get("credentials_path")
|
|
128
|
+
cred_base64 = query_params.get("credentials_base64")
|
|
129
|
+
|
|
130
|
+
if not project_id_param or not instance_id_param or not database_param:
|
|
131
|
+
raise ValueError(
|
|
132
|
+
"project_id, instance_id and database are required in the URI to get data from Google Spanner"
|
|
133
|
+
)
|
|
134
|
+
|
|
135
|
+
project_id = project_id_param[0]
|
|
136
|
+
instance_id = instance_id_param[0]
|
|
137
|
+
database = database_param[0]
|
|
138
|
+
|
|
139
|
+
if not cred_path and not cred_base64:
|
|
140
|
+
raise ValueError(
|
|
141
|
+
"credentials_path or credentials_base64 is required in the URI to get data from Google Sheets"
|
|
142
|
+
)
|
|
143
|
+
if cred_path:
|
|
144
|
+
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = cred_path[0]
|
|
145
|
+
elif cred_base64:
|
|
146
|
+
credentials = json.loads(
|
|
147
|
+
base64.b64decode(cred_base64[0]).decode("utf-8")
|
|
148
|
+
)
|
|
149
|
+
temp = tempfile.NamedTemporaryFile(
|
|
150
|
+
mode="w", delete=False, suffix=".json"
|
|
151
|
+
)
|
|
152
|
+
json.dump(credentials, temp)
|
|
153
|
+
temp.close()
|
|
154
|
+
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = temp.name
|
|
155
|
+
|
|
156
|
+
uri = f"spanner+spanner:///projects/{project_id}/instances/{instance_id}/databases/{database}"
|
|
157
|
+
|
|
116
158
|
from dlt.common.libs.sql_alchemy import (
|
|
117
159
|
Engine,
|
|
118
160
|
MetaData,
|
|
@@ -2298,3 +2340,55 @@ class PhantombusterSource:
|
|
|
2298
2340
|
start_date=start_date,
|
|
2299
2341
|
end_date=end_date,
|
|
2300
2342
|
).with_resources(table_name)
|
|
2343
|
+
|
|
2344
|
+
|
|
2345
|
+
class ElasticsearchSource:
|
|
2346
|
+
def handles_incrementality(self) -> bool:
|
|
2347
|
+
return False
|
|
2348
|
+
|
|
2349
|
+
def dlt_source(self, uri: str, table: str, **kwargs):
|
|
2350
|
+
from ingestr.src.elasticsearch import elasticsearch_source
|
|
2351
|
+
|
|
2352
|
+
incremental = None
|
|
2353
|
+
if kwargs.get("incremental_key"):
|
|
2354
|
+
start_value = kwargs.get("interval_start")
|
|
2355
|
+
end_value = kwargs.get("interval_end")
|
|
2356
|
+
|
|
2357
|
+
incremental = dlt_incremental(
|
|
2358
|
+
kwargs.get("incremental_key", ""),
|
|
2359
|
+
initial_value=start_value,
|
|
2360
|
+
end_value=end_value,
|
|
2361
|
+
range_end="closed",
|
|
2362
|
+
range_start="closed",
|
|
2363
|
+
)
|
|
2364
|
+
|
|
2365
|
+
# elasticsearch://localhost:9200?secure=true&verify_certs=false
|
|
2366
|
+
parsed = urlparse(uri)
|
|
2367
|
+
|
|
2368
|
+
index = table
|
|
2369
|
+
if not index:
|
|
2370
|
+
raise ValueError(
|
|
2371
|
+
"Table name must be provided which is the index name in elasticsearch"
|
|
2372
|
+
)
|
|
2373
|
+
|
|
2374
|
+
query_params = parsed.query
|
|
2375
|
+
params = parse_qs(query_params)
|
|
2376
|
+
|
|
2377
|
+
secure = True
|
|
2378
|
+
if "secure" in params:
|
|
2379
|
+
secure = params["secure"][0].capitalize() == "True"
|
|
2380
|
+
|
|
2381
|
+
verify_certs = True
|
|
2382
|
+
if "verify_certs" in params:
|
|
2383
|
+
verify_certs = params["verify_certs"][0].capitalize() == "True"
|
|
2384
|
+
|
|
2385
|
+
scheme = "https" if secure else "http"
|
|
2386
|
+
netloc = parsed.netloc
|
|
2387
|
+
connection_url = f"{scheme}://{netloc}"
|
|
2388
|
+
|
|
2389
|
+
return elasticsearch_source(
|
|
2390
|
+
connection_url=connection_url,
|
|
2391
|
+
index=index,
|
|
2392
|
+
verify_certs=verify_certs,
|
|
2393
|
+
incremental=incremental,
|
|
2394
|
+
).with_resources(table)
|
|
{ingestr-0.13.39.dist-info → ingestr-0.13.41.dist-info}/METADATA
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: ingestr
|
|
3
|
-
Version: 0.13.39
|
|
3
|
+
Version: 0.13.41
|
|
4
4
|
Summary: ingestr is a command-line application that ingests data from various sources and stores them in any database.
|
|
5
5
|
Project-URL: Homepage, https://github.com/bruin-data/ingestr
|
|
6
6
|
Project-URL: Issues, https://github.com/bruin-data/ingestr/issues
|
|
@@ -50,6 +50,8 @@ Requires-Dist: dlt==1.10.0
|
|
|
50
50
|
Requires-Dist: dnspython==2.7.0
|
|
51
51
|
Requires-Dist: duckdb-engine==0.17.0
|
|
52
52
|
Requires-Dist: duckdb==1.2.1
|
|
53
|
+
Requires-Dist: elastic-transport==8.17.1
|
|
54
|
+
Requires-Dist: elasticsearch==8.10.1
|
|
53
55
|
Requires-Dist: et-xmlfile==2.0.0
|
|
54
56
|
Requires-Dist: facebook-business==20.0.0
|
|
55
57
|
Requires-Dist: filelock==3.17.0
|
|
@@ -70,11 +72,14 @@ Requires-Dist: google-auth==2.38.0
|
|
|
70
72
|
Requires-Dist: google-cloud-bigquery-storage==2.24.0
|
|
71
73
|
Requires-Dist: google-cloud-bigquery==3.30.0
|
|
72
74
|
Requires-Dist: google-cloud-core==2.4.2
|
|
75
|
+
Requires-Dist: google-cloud-spanner==3.54.0
|
|
73
76
|
Requires-Dist: google-cloud-storage==3.1.0
|
|
74
77
|
Requires-Dist: google-crc32c==1.6.0
|
|
75
78
|
Requires-Dist: google-resumable-media==2.7.2
|
|
76
79
|
Requires-Dist: googleapis-common-protos==1.69.0
|
|
77
80
|
Requires-Dist: greenlet==3.2.2
|
|
81
|
+
Requires-Dist: grpc-google-iam-v1==0.14.2
|
|
82
|
+
Requires-Dist: grpc-interceptor==0.15.4
|
|
78
83
|
Requires-Dist: grpcio-status==1.62.3
|
|
79
84
|
Requires-Dist: grpcio==1.70.0
|
|
80
85
|
Requires-Dist: hdbcli==2.23.27
|
|
@@ -166,9 +171,11 @@ Requires-Dist: soupsieve==2.6
|
|
|
166
171
|
Requires-Dist: sqlalchemy-bigquery==1.12.1
|
|
167
172
|
Requires-Dist: sqlalchemy-hana==2.0.0
|
|
168
173
|
Requires-Dist: sqlalchemy-redshift==0.8.14
|
|
174
|
+
Requires-Dist: sqlalchemy-spanner==1.11.0
|
|
169
175
|
Requires-Dist: sqlalchemy2-stubs==0.0.2a38
|
|
170
176
|
Requires-Dist: sqlalchemy==1.4.52
|
|
171
177
|
Requires-Dist: sqlglot==26.12.1
|
|
178
|
+
Requires-Dist: sqlparse==0.5.3
|
|
172
179
|
Requires-Dist: stripe==10.7.0
|
|
173
180
|
Requires-Dist: tenacity==9.0.0
|
|
174
181
|
Requires-Dist: thrift==0.16.0
|
|
{ingestr-0.13.39.dist-info → ingestr-0.13.41.dist-info}/RECORD
CHANGED
|
@@ -2,15 +2,15 @@ ingestr/conftest.py,sha256=Q03FIJIZpLBbpj55cfCHIKEjc1FCvWJhMF2cidUJKQU,1748
|
|
|
2
2
|
ingestr/main.py,sha256=Pe_rzwcDRKIYa7baEVUAAPOHyqQbX29RUexMl0F_S1k,25273
|
|
3
3
|
ingestr/src/.gitignore,sha256=8cX1AZTSI0TcdZFGTmS_oyBjpfCzhOEt0DdAo2dFIY8,203
|
|
4
4
|
ingestr/src/blob.py,sha256=onMe5ZHxPXTdcB_s2oGNdMo-XQJ3ajwOsWE9eSTGFmc,1495
|
|
5
|
-
ingestr/src/buildinfo.py,sha256=
|
|
5
|
+
ingestr/src/buildinfo.py,sha256=AK7sGUNx6CPDKJOXeMexFRun9bjpyv2c9t4DII73Pes,21
|
|
6
6
|
ingestr/src/destinations.py,sha256=MctbeJUyNr0DRB0XYt2xAbEKkHZ40-nXXEOYCs4KuoE,15420
|
|
7
7
|
ingestr/src/errors.py,sha256=Ufs4_DfE77_E3vnA1fOQdi6cmuLVNm7_SbFLkL1XPGk,686
|
|
8
|
-
ingestr/src/factory.py,sha256=
|
|
8
|
+
ingestr/src/factory.py,sha256=KJKIL9q7kU4oAVXy5o0wDwLAU0nG9y0xC8D7HzksYak,5597
|
|
9
9
|
ingestr/src/filters.py,sha256=C-_TIVkF_cxZBgG-Run2Oyn0TAhJgA8IWXZ-OPY3uek,1136
|
|
10
10
|
ingestr/src/loader.py,sha256=9NaWAyfkXdqAZSS-N72Iwo36Lbx4PyqIfaaH1dNdkFs,1712
|
|
11
11
|
ingestr/src/partition.py,sha256=BrIP6wFJvyR7Nus_3ElnfxknUXeCipK_E_bB8kZowfc,969
|
|
12
12
|
ingestr/src/resource.py,sha256=XG-sbBapFVEM7OhHQFQRTdTLlh-mHB-N4V1t8F8Tsww,543
|
|
13
|
-
ingestr/src/sources.py,sha256=
|
|
13
|
+
ingestr/src/sources.py,sha256=SWZAa6bokLurQRPtH7rxi8K-GSVLp_p9Ig1ArGRsxCo,82635
|
|
14
14
|
ingestr/src/table_definition.py,sha256=REbAbqdlmUMUuRh8nEQRreWjPVOQ5ZcfqGkScKdCrmk,390
|
|
15
15
|
ingestr/src/time.py,sha256=H_Fk2J4ShXyUM-EMY7MqCLZQhlnZMZvO952bmZPc4yE,254
|
|
16
16
|
ingestr/src/version.py,sha256=J_2xgZ0mKlvuHcjdKCx2nlioneLH0I47JiU_Slr_Nwc,189
|
|
@@ -35,6 +35,7 @@ ingestr/src/chess/helpers.py,sha256=v1HTImOMjAF7AzZUPDIuHu00e7ut0o5y1kWcVYo4QZw,
|
|
|
35
35
|
ingestr/src/chess/settings.py,sha256=p0RlCGgtXUacPDEvZmwzSWmzX0Apj1riwfz-nrMK89k,158
|
|
36
36
|
ingestr/src/collector/spinner.py,sha256=_ZUqF5MI43hVIULdjF5s5mrAZbhEFXaiWirQmrv3Yk4,1201
|
|
37
37
|
ingestr/src/dynamodb/__init__.py,sha256=swhxkeYBbJ35jn1IghCtvYWT2BM33KynVCh_oR4z28A,2264
|
|
38
|
+
ingestr/src/elasticsearch/__init__.py,sha256=m-q93HgUmTwGDUwHOjHawstWL06TC3WIX3H05szybrY,2556
|
|
38
39
|
ingestr/src/facebook_ads/__init__.py,sha256=reEpSr4BaKA1wO3qVgCH51gW-TgWkbJ_g24UIhJWbac,9286
|
|
39
40
|
ingestr/src/facebook_ads/exceptions.py,sha256=4Nlbc0Mv3i5g-9AoyT-n1PIa8IDi3VCTfEAzholx4Wc,115
|
|
40
41
|
ingestr/src/facebook_ads/helpers.py,sha256=ZLbNHiKer5lPb4g3_435XeBJr57Wv0o1KTyBA1mQ100,9068
|
|
@@ -127,8 +128,8 @@ ingestr/testdata/delete_insert_part2.csv,sha256=B_KUzpzbNdDY_n7wWop1mT2cz36TmayS
|
|
|
127
128
|
ingestr/testdata/merge_expected.csv,sha256=DReHqWGnQMsf2PBv_Q2pfjsgvikYFnf1zYcQZ7ZqYN0,276
|
|
128
129
|
ingestr/testdata/merge_part1.csv,sha256=Pw8Z9IDKcNU0qQHx1z6BUf4rF_-SxKGFOvymCt4OY9I,185
|
|
129
130
|
ingestr/testdata/merge_part2.csv,sha256=T_GiWxA81SN63_tMOIuemcvboEFeAmbKc7xRXvL9esw,287
|
|
130
|
-
ingestr-0.13.
|
|
131
|
-
ingestr-0.13.
|
|
132
|
-
ingestr-0.13.
|
|
133
|
-
ingestr-0.13.
|
|
134
|
-
ingestr-0.13.
|
|
131
|
+
ingestr-0.13.41.dist-info/METADATA,sha256=UzGBs9s0Kr6R1xji_ULG5Tuc383Klx2AIzfyZdXLBp4,13852
|
|
132
|
+
ingestr-0.13.41.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
|
|
133
|
+
ingestr-0.13.41.dist-info/entry_points.txt,sha256=oPJy0KBnPWYjDtP1k8qwAihcTLHSZokSQvRAw_wtfJM,46
|
|
134
|
+
ingestr-0.13.41.dist-info/licenses/LICENSE.md,sha256=cW8wIhn8HFE-KLStDF9jHQ1O_ARWP3kTpk_-eOccL24,1075
|
|
135
|
+
ingestr-0.13.41.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|