ingestr 0.10.0rc5__py3-none-any.whl → 0.10.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of ingestr might be problematic. See the registry's advisory page for more details.
- ingestr/main.py +20 -3
- ingestr/src/adjust/__init__.py +1 -1
- ingestr/src/destinations.py +61 -1
- ingestr/src/factory.py +2 -0
- ingestr/src/filters.py +0 -2
- ingestr/src/sources.py +11 -6
- ingestr/src/version.py +1 -1
- {ingestr-0.10.0rc5.dist-info → ingestr-0.10.2.dist-info}/METADATA +4 -2
- {ingestr-0.10.0rc5.dist-info → ingestr-0.10.2.dist-info}/RECORD +12 -12
- {ingestr-0.10.0rc5.dist-info → ingestr-0.10.2.dist-info}/WHEEL +0 -0
- {ingestr-0.10.0rc5.dist-info → ingestr-0.10.2.dist-info}/entry_points.txt +0 -0
- {ingestr-0.10.0rc5.dist-info → ingestr-0.10.2.dist-info}/licenses/LICENSE.md +0 -0
ingestr/main.py
CHANGED
|
@@ -348,6 +348,14 @@ def ingest(
|
|
|
348
348
|
)
|
|
349
349
|
raise typer.Abort()
|
|
350
350
|
|
|
351
|
+
def run_on_resource(source, executable):
|
|
352
|
+
if hasattr(source, "selected_resources") and source.selected_resources:
|
|
353
|
+
resource_names = list(source.selected_resources.keys())
|
|
354
|
+
for res in resource_names:
|
|
355
|
+
executable(source.resources[res])
|
|
356
|
+
else:
|
|
357
|
+
executable(source)
|
|
358
|
+
|
|
351
359
|
track(
|
|
352
360
|
"command_triggered",
|
|
353
361
|
{
|
|
@@ -487,10 +495,20 @@ def ingest(
|
|
|
487
495
|
sql_exclude_columns=sql_exclude_columns,
|
|
488
496
|
)
|
|
489
497
|
|
|
490
|
-
dlt_source.add_map(cast_set_to_list)
|
|
498
|
+
run_on_resource(dlt_source, lambda x: x.add_map(cast_set_to_list))
|
|
499
|
+
|
|
500
|
+
def col_h(x):
|
|
501
|
+
if column_hints:
|
|
502
|
+
x.apply_hints(columns=column_hints)
|
|
503
|
+
|
|
504
|
+
run_on_resource(dlt_source, col_h)
|
|
491
505
|
|
|
492
506
|
if original_incremental_strategy == IncrementalStrategy.delete_insert:
|
|
493
|
-
|
|
507
|
+
|
|
508
|
+
def set_primary_key(x):
|
|
509
|
+
x.incremental.primary_key = ()
|
|
510
|
+
|
|
511
|
+
run_on_resource(dlt_source, set_primary_key)
|
|
494
512
|
|
|
495
513
|
if (
|
|
496
514
|
factory.destination_scheme in PARQUET_SUPPORTED_DESTINATIONS
|
|
@@ -522,7 +540,6 @@ def ingest(
|
|
|
522
540
|
loader_file_format=(
|
|
523
541
|
loader_file_format.value if loader_file_format is not None else None # type: ignore
|
|
524
542
|
), # type: ignore
|
|
525
|
-
columns=column_hints,
|
|
526
543
|
)
|
|
527
544
|
|
|
528
545
|
report_errors(run_info)
|
ingestr/src/adjust/__init__.py
CHANGED
|
@@ -82,7 +82,7 @@ def adjust_source(
|
|
|
82
82
|
type_hints[metric] = KNOWN_TYPE_HINTS[metric]
|
|
83
83
|
|
|
84
84
|
@dlt.resource(
|
|
85
|
-
write_disposition={"disposition": "merge", "strategy": "delete
|
|
85
|
+
write_disposition={"disposition": "merge", "strategy": "delete-insert"},
|
|
86
86
|
merge_key=merge_key,
|
|
87
87
|
primary_key=dimensions,
|
|
88
88
|
columns=type_hints,
|
ingestr/src/destinations.py
CHANGED
|
@@ -5,9 +5,10 @@ import json
|
|
|
5
5
|
import os
|
|
6
6
|
import shutil
|
|
7
7
|
import tempfile
|
|
8
|
-
from urllib.parse import parse_qs, urlparse
|
|
8
|
+
from urllib.parse import parse_qs, quote, urlparse
|
|
9
9
|
|
|
10
10
|
import dlt
|
|
11
|
+
from dlt.common.configuration.specs import AwsCredentials
|
|
11
12
|
|
|
12
13
|
|
|
13
14
|
class GenericSqlDestination:
|
|
@@ -194,3 +195,62 @@ class CsvDestination(GenericSqlDestination):
|
|
|
194
195
|
csv_writer.writerow(json_obj)
|
|
195
196
|
|
|
196
197
|
shutil.rmtree(self.temp_path)
|
|
198
|
+
|
|
199
|
+
|
|
200
|
+
class AthenaDestination:
|
|
201
|
+
def dlt_dest(self, uri: str, **kwargs):
|
|
202
|
+
encoded_uri = quote(uri, safe=":/?&=")
|
|
203
|
+
source_fields = urlparse(encoded_uri)
|
|
204
|
+
source_params = parse_qs(source_fields.query)
|
|
205
|
+
|
|
206
|
+
bucket = source_params.get("bucket", [None])[0]
|
|
207
|
+
if not bucket:
|
|
208
|
+
raise ValueError("A bucket is required to connect to Athena.")
|
|
209
|
+
|
|
210
|
+
if not bucket.startswith("s3://"):
|
|
211
|
+
bucket = f"s3://{bucket}"
|
|
212
|
+
|
|
213
|
+
query_result_path = source_params.get("query_results_path", [None])[0]
|
|
214
|
+
if query_result_path:
|
|
215
|
+
if not query_result_path.startswith("s3://"):
|
|
216
|
+
query_result_path = f"s3://{query_result_path}"
|
|
217
|
+
else:
|
|
218
|
+
query_result_path = bucket
|
|
219
|
+
|
|
220
|
+
access_key_id = source_params.get("access_key_id", [None])[0]
|
|
221
|
+
if not access_key_id:
|
|
222
|
+
raise ValueError("The AWS access_key_id is required to connect to Athena.")
|
|
223
|
+
|
|
224
|
+
secret_access_key = source_params.get("secret_access_key", [None])[0]
|
|
225
|
+
if not secret_access_key:
|
|
226
|
+
raise ValueError("The AWS secret_access_key is required to connect Athena")
|
|
227
|
+
|
|
228
|
+
work_group = source_params.get("workgroup", [None])[0]
|
|
229
|
+
|
|
230
|
+
region_name = source_params.get("region_name", [None])[0]
|
|
231
|
+
if not region_name:
|
|
232
|
+
raise ValueError("The region_name is required to connect to Athena.")
|
|
233
|
+
|
|
234
|
+
os.environ["DESTINATION__BUCKET_URL"] = bucket
|
|
235
|
+
os.environ["DESTINATION__CREDENTIALS__AWS_ACCESS_KEY_ID"] = access_key_id
|
|
236
|
+
os.environ["DESTINATION__CREDENTIALS__AWS_SECRET_ACCESS_KEY"] = (
|
|
237
|
+
secret_access_key
|
|
238
|
+
)
|
|
239
|
+
|
|
240
|
+
credentials = AwsCredentials(
|
|
241
|
+
aws_access_key_id=access_key_id,
|
|
242
|
+
aws_secret_access_key=secret_access_key,
|
|
243
|
+
region_name=region_name,
|
|
244
|
+
)
|
|
245
|
+
return dlt.destinations.athena(
|
|
246
|
+
query_result_bucket=query_result_path,
|
|
247
|
+
athena_work_group=work_group,
|
|
248
|
+
credentials=credentials,
|
|
249
|
+
destination_name=bucket,
|
|
250
|
+
)
|
|
251
|
+
|
|
252
|
+
def dlt_run_params(self, uri: str, table: str, **kwargs) -> dict:
|
|
253
|
+
return {}
|
|
254
|
+
|
|
255
|
+
def post_load(self):
|
|
256
|
+
pass
|
ingestr/src/factory.py
CHANGED
|
@@ -4,6 +4,7 @@ from urllib.parse import urlparse
|
|
|
4
4
|
from dlt.common.destination import Destination
|
|
5
5
|
|
|
6
6
|
from ingestr.src.destinations import (
|
|
7
|
+
AthenaDestination,
|
|
7
8
|
BigQueryDestination,
|
|
8
9
|
CsvDestination,
|
|
9
10
|
DatabricksDestination,
|
|
@@ -159,6 +160,7 @@ class SourceDestinationFactory:
|
|
|
159
160
|
"snowflake": SnowflakeDestination(),
|
|
160
161
|
"synapse": SynapseDestination(),
|
|
161
162
|
"csv": CsvDestination(),
|
|
163
|
+
"athena": AthenaDestination(),
|
|
162
164
|
}
|
|
163
165
|
|
|
164
166
|
if self.destination_scheme in match:
|
ingestr/src/filters.py
CHANGED
|
@@ -11,8 +11,6 @@ def cast_set_to_list(row):
|
|
|
11
11
|
|
|
12
12
|
|
|
13
13
|
def table_adapter_exclude_columns(cols: list[str]):
|
|
14
|
-
print("given cols", cols)
|
|
15
|
-
|
|
16
14
|
def excluder(table: Table):
|
|
17
15
|
cols_to_remove = [col for col in table._columns if col.name in cols] # type: ignore
|
|
18
16
|
for col in cols_to_remove:
|
ingestr/src/sources.py
CHANGED
|
@@ -76,14 +76,17 @@ class SqlSource:
|
|
|
76
76
|
if kwargs.get("sql_limit"):
|
|
77
77
|
|
|
78
78
|
def query_adapter_callback(query, table):
|
|
79
|
-
|
|
79
|
+
query = query.limit(kwargs.get("sql_limit"))
|
|
80
|
+
if kwargs.get("incremental_key"):
|
|
81
|
+
query = query.order_by(kwargs.get("incremental_key"))
|
|
82
|
+
return query
|
|
80
83
|
|
|
81
84
|
def type_adapter_callback(sql_type):
|
|
82
85
|
if isinstance(sql_type, mysql.SET):
|
|
83
86
|
return sa.JSON
|
|
84
87
|
return sql_type
|
|
85
88
|
|
|
86
|
-
|
|
89
|
+
builder_res = self.table_builder(
|
|
87
90
|
credentials=ConnectionStringCredentials(uri),
|
|
88
91
|
schema=table_fields.dataset,
|
|
89
92
|
table=table_fields.table,
|
|
@@ -98,7 +101,7 @@ class SqlSource:
|
|
|
98
101
|
),
|
|
99
102
|
)
|
|
100
103
|
|
|
101
|
-
return
|
|
104
|
+
return builder_res
|
|
102
105
|
|
|
103
106
|
|
|
104
107
|
class ArrowMemoryMappedSource:
|
|
@@ -744,7 +747,7 @@ class KafkaSource:
|
|
|
744
747
|
|
|
745
748
|
class AdjustSource:
|
|
746
749
|
def handles_incrementality(self) -> bool:
|
|
747
|
-
return
|
|
750
|
+
return True
|
|
748
751
|
|
|
749
752
|
def dlt_source(self, uri: str, table: str, **kwargs):
|
|
750
753
|
if kwargs.get("incremental_key") and not table.startswith("custom:"):
|
|
@@ -806,7 +809,7 @@ class AdjustSource:
|
|
|
806
809
|
filters_raw = fields[3]
|
|
807
810
|
filters = parse_filters(filters_raw)
|
|
808
811
|
|
|
809
|
-
|
|
812
|
+
src = adjust_source(
|
|
810
813
|
start_date=start_date,
|
|
811
814
|
end_date=end_date,
|
|
812
815
|
api_key=api_key[0],
|
|
@@ -814,7 +817,9 @@ class AdjustSource:
|
|
|
814
817
|
metrics=metrics,
|
|
815
818
|
merge_key=kwargs.get("merge_key"),
|
|
816
819
|
filters=filters,
|
|
817
|
-
)
|
|
820
|
+
)
|
|
821
|
+
|
|
822
|
+
return src.with_resources(table)
|
|
818
823
|
|
|
819
824
|
|
|
820
825
|
class AppsflyerSource:
|
ingestr/src/version.py
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
__version__ = "0.10.
|
|
1
|
+
__version__ = "0.10.2"
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.3
|
|
2
2
|
Name: ingestr
|
|
3
|
-
Version: 0.10.0rc5
|
|
3
|
+
Version: 0.10.2
|
|
4
4
|
Summary: ingestr is a command-line application that ingests data from various sources and stores them in any database.
|
|
5
5
|
Project-URL: Homepage, https://github.com/bruin-data/ingestr
|
|
6
6
|
Project-URL: Issues, https://github.com/bruin-data/ingestr/issues
|
|
@@ -26,13 +26,15 @@ Requires-Dist: pendulum==3.0.0
|
|
|
26
26
|
Requires-Dist: psycopg2-binary==2.9.10
|
|
27
27
|
Requires-Dist: py-machineid==0.6.0
|
|
28
28
|
Requires-Dist: pyairtable==2.3.3
|
|
29
|
+
Requires-Dist: pyarrow==18.1.0
|
|
30
|
+
Requires-Dist: pyathena==3.9.0
|
|
29
31
|
Requires-Dist: pymongo==4.10.1
|
|
30
32
|
Requires-Dist: pymysql==1.1.1
|
|
31
33
|
Requires-Dist: pyrate-limiter==3.7.0
|
|
32
34
|
Requires-Dist: redshift-connector==2.1.3
|
|
33
35
|
Requires-Dist: rich==13.9.4
|
|
34
36
|
Requires-Dist: rudder-sdk-python==2.1.4
|
|
35
|
-
Requires-Dist: s3fs==2024.
|
|
37
|
+
Requires-Dist: s3fs==2024.10.0
|
|
36
38
|
Requires-Dist: snowflake-sqlalchemy==1.6.1
|
|
37
39
|
Requires-Dist: sqlalchemy-bigquery==1.12.0
|
|
38
40
|
Requires-Dist: sqlalchemy-hana==2.0.0
|
|
@@ -1,12 +1,12 @@
|
|
|
1
|
-
ingestr/main.py,sha256=
|
|
1
|
+
ingestr/main.py,sha256=Uq0GTfCtlYu94Iw41AWhexZjgNdaEi7SqxkRAl0iTCg,22001
|
|
2
2
|
ingestr/src/.gitignore,sha256=8cX1AZTSI0TcdZFGTmS_oyBjpfCzhOEt0DdAo2dFIY8,203
|
|
3
|
-
ingestr/src/destinations.py,sha256=
|
|
4
|
-
ingestr/src/factory.py,sha256=
|
|
5
|
-
ingestr/src/filters.py,sha256=
|
|
6
|
-
ingestr/src/sources.py,sha256=
|
|
3
|
+
ingestr/src/destinations.py,sha256=wT76Pi3JBbzfKj2goy4-L_XDPfjyPK6b95zyRxksr9g,8555
|
|
4
|
+
ingestr/src/factory.py,sha256=nYWgWQINQEQKPeELwGY7MCeiOSoCP6JDPozfKKyGNXk,5013
|
|
5
|
+
ingestr/src/filters.py,sha256=0JQXeAr2APFMnW2sd-6BlAMWv93bXV17j8b5MM8sHmM,580
|
|
6
|
+
ingestr/src/sources.py,sha256=KhY6AH91zZoSthi7AbFd4_OsrPmxP3Q4ratA7ZscsZU,34810
|
|
7
7
|
ingestr/src/table_definition.py,sha256=REbAbqdlmUMUuRh8nEQRreWjPVOQ5ZcfqGkScKdCrmk,390
|
|
8
|
-
ingestr/src/version.py,sha256=
|
|
9
|
-
ingestr/src/adjust/__init__.py,sha256=
|
|
8
|
+
ingestr/src/version.py,sha256=A_AARqtxTOj_AQTpjpgOxNx-UOBio5wYFfZ2mrdMKfs,23
|
|
9
|
+
ingestr/src/adjust/__init__.py,sha256=I_G90D260OPIWCS716k0U4aeztlAieW9zi0R9-oW7TA,3007
|
|
10
10
|
ingestr/src/adjust/adjust_helpers.py,sha256=-tmmxy9k3wms-ZEIgxmlp2cAQ2X_O1lgjY1128bbMu4,3224
|
|
11
11
|
ingestr/src/airtable/__init__.py,sha256=GHWYrjI2qhs_JihdNJysB0Ni3bzqT_MLXn_S9_Q5zRA,2775
|
|
12
12
|
ingestr/src/appsflyer/_init_.py,sha256=ne2-9FQ654Drtd3GkKQv8Bwb6LEqCnJw49MfO5Jyzgs,739
|
|
@@ -70,8 +70,8 @@ ingestr/testdata/delete_insert_part2.csv,sha256=B_KUzpzbNdDY_n7wWop1mT2cz36TmayS
|
|
|
70
70
|
ingestr/testdata/merge_expected.csv,sha256=DReHqWGnQMsf2PBv_Q2pfjsgvikYFnf1zYcQZ7ZqYN0,276
|
|
71
71
|
ingestr/testdata/merge_part1.csv,sha256=Pw8Z9IDKcNU0qQHx1z6BUf4rF_-SxKGFOvymCt4OY9I,185
|
|
72
72
|
ingestr/testdata/merge_part2.csv,sha256=T_GiWxA81SN63_tMOIuemcvboEFeAmbKc7xRXvL9esw,287
|
|
73
|
-
ingestr-0.10.0rc5.dist-info/METADATA,sha256=
|
|
74
|
-
ingestr-0.10.0rc5.dist-info/WHEEL,sha256=C2FUgwZgiLbznR-k0b_5k3Ai_1aASOXDss3lzCUsUug,87
|
|
75
|
-
ingestr-0.10.
|
|
76
|
-
ingestr-0.10.0rc5.dist-info/licenses/LICENSE.md,sha256=cW8wIhn8HFE-KLStDF9jHQ1O_ARWP3kTpk_-eOccL24,1075
|
|
77
|
-
ingestr-0.10.0rc5.dist-info/RECORD,,
|
|
73
|
+
ingestr-0.10.2.dist-info/METADATA,sha256=3_ilZkg36lUCtkfUBRlI2LnVj4Vl5OKq_R8NhvKwWk4,7123
|
|
74
|
+
ingestr-0.10.2.dist-info/WHEEL,sha256=C2FUgwZgiLbznR-k0b_5k3Ai_1aASOXDss3lzCUsUug,87
|
|
75
|
+
ingestr-0.10.2.dist-info/entry_points.txt,sha256=oPJy0KBnPWYjDtP1k8qwAihcTLHSZokSQvRAw_wtfJM,46
|
|
76
|
+
ingestr-0.10.2.dist-info/licenses/LICENSE.md,sha256=cW8wIhn8HFE-KLStDF9jHQ1O_ARWP3kTpk_-eOccL24,1075
|
|
77
|
+
ingestr-0.10.2.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|