ingestr 0.13.3__py3-none-any.whl → 0.13.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ingestr/src/destinations.py +8 -4
- ingestr/src/factory.py +1 -0
- ingestr/src/loader.py +69 -0
- ingestr/src/sources.py +5 -0
- ingestr/src/version.py +6 -1
- {ingestr-0.13.3.dist-info → ingestr-0.13.5.dist-info}/METADATA +58 -19
- {ingestr-0.13.3.dist-info → ingestr-0.13.5.dist-info}/RECORD +10 -9
- {ingestr-0.13.3.dist-info → ingestr-0.13.5.dist-info}/WHEEL +0 -0
- {ingestr-0.13.3.dist-info → ingestr-0.13.5.dist-info}/entry_points.txt +0 -0
- {ingestr-0.13.3.dist-info → ingestr-0.13.5.dist-info}/licenses/LICENSE.md +0 -0
ingestr/src/destinations.py
CHANGED
@@ -7,12 +7,13 @@ import tempfile
 from urllib.parse import parse_qs, quote, urlparse
 
 import dlt
-import pyarrow.parquet  # type: ignore
 from dlt.common.configuration.specs import AwsCredentials
 from dlt.destinations.impl.clickhouse.configuration import (
     ClickHouseCredentials,
 )
 
+from ingestr.src.loader import load_dlt_file
+
 
 class GenericSqlDestination:
     def dlt_run_params(self, uri: str, table: str, **kwargs) -> dict:
@@ -59,9 +60,14 @@ class BigQueryDestination:
             base64.b64decode(credentials_base64[0]).decode("utf-8")
         )
 
+        project_id = None
+        if source_fields.hostname:
+            project_id = source_fields.hostname
+
         return dlt.destinations.bigquery(
             credentials=credentials,  # type: ignore
             location=location,
+            project_id=project_id,
             **kwargs,
         )
 
@@ -184,11 +190,9 @@ class CsvDestination(GenericSqlDestination):
         if output_path.count("/") > 1:
             os.makedirs(os.path.dirname(output_path), exist_ok=True)
 
-        table = pyarrow.parquet.read_table(first_file_path)
-        rows = table.to_pylist()
         with open(output_path, "w", newline="") as csv_file:
             csv_writer = None
-            for row in rows:
+            for row in load_dlt_file(first_file_path):
                 row = filter_keys(row)
                 if csv_writer is None:
                     csv_writer = csv.DictWriter(csv_file, fieldnames=row.keys())
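For context on the BigQuery hunk above: the destination project can now come from the host portion of the destination URI. A minimal sketch of that behaviour, assuming source_fields is the result of urlparse() on the destination URI (which the parse_qs/urlparse imports in destinations.py suggest) and using a made-up project name:

from urllib.parse import urlparse

source_fields = urlparse("bigquery://my-gcp-project?location=EU")  # illustrative URI

project_id = None
if source_fields.hostname:
    project_id = source_fields.hostname  # -> "my-gcp-project"

# project_id is then forwarded to dlt.destinations.bigquery(...); it simply
# stays None when the URI carries no host.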
ingestr/src/factory.py
CHANGED
ingestr/src/loader.py
ADDED
@@ -0,0 +1,69 @@
+import csv
+import gzip
+import json
+import subprocess
+from contextlib import contextmanager
+from typing import Generator
+
+from pyarrow.parquet import ParquetFile  # type: ignore
+
+PARQUET_BATCH_SIZE = 64
+
+
+class UnsupportedLoaderFileFormat(Exception):
+    pass
+
+
+def load_dlt_file(filepath: str) -> Generator:
+    """
+    load_dlt_file reads dlt loader files. It handles different loader file formats
+    automatically. It returns a generator that yields data items as a python dict
+    """
+    result = subprocess.run(
+        ["file", "-b", filepath],
+        check=True,
+        capture_output=True,
+        text=True,
+    )
+
+    filetype = result.stdout.strip()
+    with factory(filetype, filepath) as reader:
+        yield from reader
+
+
+def factory(filetype: str, filepath: str):
+    # ???(turtledev): can dlt produce non-gzipped jsonl files?
+    if filetype.startswith("gzip"):
+        return jsonlfile(filepath)
+    elif filetype.startswith("CSV"):
+        return csvfile(filepath)
+    elif filetype.startswith("Apache Parquet"):
+        return parquetfile(filepath)
+    else:
+        raise UnsupportedLoaderFileFormat(filetype)
+
+
+@contextmanager
+def jsonlfile(filepath: str):
+    def reader(fd):
+        for line in fd:
+            yield json.loads(line.decode().strip())
+
+    with gzip.open(filepath) as fd:
+        yield reader(fd)
+
+
+@contextmanager
+def csvfile(filepath: str):
+    with open(filepath, "r") as fd:
+        yield csv.DictReader(fd)
+
+
+@contextmanager
+def parquetfile(filepath: str):
+    def reader(pf: ParquetFile):
+        for batch in pf.iter_batches(PARQUET_BATCH_SIZE):
+            yield from batch.to_pylist()
+
+    with open(filepath, "rb") as fd:
+        yield reader(ParquetFile(fd))
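A rough usage sketch for the new module (not part of the diff; the path and printed fields are illustrative). load_dlt_file shells out to the file utility to detect the format, then yields every record as a plain dict, whether dlt wrote gzipped JSONL, CSV, or Parquet:

from ingestr.src.loader import load_dlt_file

# Any file produced by a dlt load package; the path here is made up.
for row in load_dlt_file("/tmp/dlt-load/items.parquet"):
    print(row)  # e.g. {"id": 1, "name": "..."}

Since format detection runs the external file command, the host needs that binary available; an unrecognised format raises UnsupportedLoaderFileFormat.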
ingestr/src/sources.py
CHANGED
@@ -133,6 +133,11 @@ class SqlSource:
         if uri.startswith("mysql://"):
             uri = uri.replace("mysql://", "mysql+pymysql://")
 
+        if uri.startswith("clickhouse://"):
+            uri = uri.replace("clickhouse://", "clickhouse+native://")
+            if "secure=" not in uri:
+                uri += "?secure=1"
+
         query_adapters = []
         if kwargs.get("sql_limit"):
             query_adapters.append(
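To illustrate the new ClickHouse handling (a sketch with a made-up URI): a plain clickhouse:// source URI is rewritten to the SQLAlchemy native driver, and a secure flag is appended when the caller has not set one.

uri = "clickhouse://user:password@ch.example.com:9440/analytics"  # illustrative
if uri.startswith("clickhouse://"):
    uri = uri.replace("clickhouse://", "clickhouse+native://")
    if "secure=" not in uri:
        uri += "?secure=1"
# uri -> "clickhouse+native://user:password@ch.example.com:9440/analytics?secure=1"

As written, the flag is appended with "?", so the rewrite assumes the incoming URI carries no other query parameters.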
ingestr/src/version.py
CHANGED
{ingestr-0.13.3.dist-info → ingestr-0.13.5.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: ingestr
-Version: 0.13.3
+Version: 0.13.5
 Summary: ingestr is a command-line application that ingests data from various sources and stores them in any database.
 Project-URL: Homepage, https://github.com/bruin-data/ingestr
 Project-URL: Issues, https://github.com/bruin-data/ingestr/issues
@@ -132,7 +132,6 @@ Pull requests are welcome. However, please open an issue first to discuss what y
 > After cloning `ingestr` make sure to run `make setup` to install githooks.
 
 ## Supported sources & destinations
-
 <table>
 <tr>
 <th></th>
@@ -142,33 +141,28 @@ Pull requests are welcome. However, please open an issue first to discuss what y
 <tr>
 <td colspan="3" style='text-align:center;'><strong>Databases</strong></td>
 </tr>
-<tr>
-<td>Postgres</td>
-<td>✅</td>
-<td>✅</td>
-</tr>
 <tr>
 <td>BigQuery</td>
 <td>✅</td>
 <td>✅</td>
 </tr>
 <tr>
-<td>
-<td
+<td>ClickHouse</td>
+<td>❌</td>
 <td>✅</td>
 </tr>
 <tr>
-<td>
+<td>Databricks</td>
 <td>✅</td>
 <td>✅</td>
 </tr>
 <tr>
-<td>
+<td>DuckDB</td>
 <td>✅</td>
 <td>✅</td>
 </tr>
 <tr>
-<td>
+<td>Local CSV file</td>
 <td>✅</td>
 <td>✅</td>
 </tr>
@@ -178,12 +172,12 @@ Pull requests are welcome. However, please open an issue first to discuss what y
 <td>✅</td>
 </tr>
 <tr>
-<td>
-<td>✅</td>
+<td>MongoDB</td>
 <td>✅</td>
+<td>❌</td>
 </tr>
 <tr>
-<td>
+<td>MySQL</td>
 <td>✅</td>
 <td>❌</td>
 </tr>
@@ -192,18 +186,28 @@ Pull requests are welcome. However, please open an issue first to discuss what y
 <td>✅</td>
 <td>❌</td>
 </tr>
+<tr>
+<td>Postgres</td>
+<td>✅</td>
+<td>✅</td>
+</tr>
+<tr>
+<td>Redshift</td>
+<td>✅</td>
+<td>✅</td>
+</tr>
 <tr>
 <td>SAP Hana</td>
 <td>✅</td>
 <td>❌</td>
 </tr>
 <tr>
-<td>
+<td>Snowflake</td>
+<td>✅</td>
 <td>✅</td>
-<td>❌</td>
 </tr>
 <tr>
-<td>
+<td>SQLite</td>
 <td>✅</td>
 <td>❌</td>
 </tr>
@@ -218,11 +222,21 @@ Pull requests are welcome. However, please open an issue first to discuss what y
 <td>✅</td>
 <td>-</td>
 </tr>
+<tr>
+<td>Apache Kafka</td>
+<td>✅</td>
+<td>-</td>
+</tr>
 <tr>
 <td>AppsFlyer</td>
 <td>✅</td>
 <td>-</td>
 </tr>
+<tr>
+<td>App Store</td>
+<td>✅</td>
+<td>-</td>
+</tr>
 <tr>
 <td>Asana</td>
 <td>✅</td>
@@ -243,6 +257,11 @@ Pull requests are welcome. However, please open an issue first to discuss what y
 <td>✅</td>
 <td>-</td>
 </tr>
+<tr>
+<td>Github</td>
+<td>✅</td>
+<td>-</td>
+</tr>
 <tr>
 <td>Gorgias</td>
 <td>✅</td>
@@ -254,7 +273,17 @@ Pull requests are welcome. However, please open an issue first to discuss what y
 <td>-</td>
 </tr>
 <tr>
-<td>
+<td>Google Ads</td>
+<td>✅</td>
+<td>-</td>
+</tr>
+<tr>
+<td>Google Analytics</td>
+<td>✅</td>
+<td>-</td>
+</tr>
+<tr>
+<td>Intercom</td>
 <td>✅</td>
 <td>-</td>
 </tr>
@@ -263,6 +292,11 @@ Pull requests are welcome. However, please open an issue first to discuss what y
 <td>✅</td>
 <td>-</td>
 </tr>
+<tr>
+<td>LinkedIn Ads</td>
+<td>✅</td>
+<td>-</td>
+</tr>
 <tr>
 <td>Notion</td>
 <td>✅</td>
@@ -288,6 +322,11 @@ Pull requests are welcome. However, please open an issue first to discuss what y
 <td>✅</td>
 <td>-</td>
 </tr>
+<tr>
+<td>TikTok Ads</td>
+<td>✅</td>
+<td>-</td>
+</tr>
 <tr>
 <td>Zendesk</td>
 <td>✅</td>
{ingestr-0.13.3.dist-info → ingestr-0.13.5.dist-info}/RECORD
CHANGED
@@ -1,14 +1,15 @@
 ingestr/main.py,sha256=ufn8AcM2ID80ChUApJzYDjnQaurMXOkYfTm6GzAggSQ,24746
 ingestr/src/.gitignore,sha256=8cX1AZTSI0TcdZFGTmS_oyBjpfCzhOEt0DdAo2dFIY8,203
 ingestr/src/blob.py,sha256=XDk_XqmU_He4sQ1brY3ceoZgpq_ZBZihz1gHW9MzqUk,1381
-ingestr/src/destinations.py,sha256=
+ingestr/src/destinations.py,sha256=vrGij4qMPCdXTMIimROWBJFqzOqCM4DFmgyubgSHejA,11279
 ingestr/src/errors.py,sha256=Ufs4_DfE77_E3vnA1fOQdi6cmuLVNm7_SbFLkL1XPGk,686
-ingestr/src/factory.py,sha256=
+ingestr/src/factory.py,sha256=CG_CZox_vJOYfmlCJ1FH9Ipb5LDeESCzox5ZOAO-wjs,4944
 ingestr/src/filters.py,sha256=0JQXeAr2APFMnW2sd-6BlAMWv93bXV17j8b5MM8sHmM,580
-ingestr/src/
+ingestr/src/loader.py,sha256=9NaWAyfkXdqAZSS-N72Iwo36Lbx4PyqIfaaH1dNdkFs,1712
+ingestr/src/sources.py,sha256=AxtWtqhn1OOFAYypLph2FhmD3CD7CDVBRvOIKb6mDIg,63263
 ingestr/src/table_definition.py,sha256=REbAbqdlmUMUuRh8nEQRreWjPVOQ5ZcfqGkScKdCrmk,390
 ingestr/src/time.py,sha256=H_Fk2J4ShXyUM-EMY7MqCLZQhlnZMZvO952bmZPc4yE,254
-ingestr/src/version.py,sha256=
+ingestr/src/version.py,sha256=l6zVm0GMMwnBlIOONWc6snhko9d8-HO1y6Jj1T1vsiQ,158
 ingestr/src/adjust/__init__.py,sha256=ULjtJqrNS6XDvUyGl0tjl12-tLyXlCgeFe2icTbtu3Q,3255
 ingestr/src/adjust/adjust_helpers.py,sha256=av97NPSn-hQtTbAC0vUSCAWYePmOiG5R-DGdMssm7FQ,3646
 ingestr/src/airtable/__init__.py,sha256=GHWYrjI2qhs_JihdNJysB0Ni3bzqT_MLXn_S9_Q5zRA,2775
@@ -101,8 +102,8 @@ ingestr/testdata/delete_insert_part2.csv,sha256=B_KUzpzbNdDY_n7wWop1mT2cz36TmayS
 ingestr/testdata/merge_expected.csv,sha256=DReHqWGnQMsf2PBv_Q2pfjsgvikYFnf1zYcQZ7ZqYN0,276
 ingestr/testdata/merge_part1.csv,sha256=Pw8Z9IDKcNU0qQHx1z6BUf4rF_-SxKGFOvymCt4OY9I,185
 ingestr/testdata/merge_part2.csv,sha256=T_GiWxA81SN63_tMOIuemcvboEFeAmbKc7xRXvL9esw,287
-ingestr-0.13.
-ingestr-0.13.
-ingestr-0.13.
-ingestr-0.13.
-ingestr-0.13.
+ingestr-0.13.5.dist-info/METADATA,sha256=5kk8N8xrnWAJvOL4n2Mh1Xv7sXOn6iJyybHVBEGNRcU,8956
+ingestr-0.13.5.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+ingestr-0.13.5.dist-info/entry_points.txt,sha256=oPJy0KBnPWYjDtP1k8qwAihcTLHSZokSQvRAw_wtfJM,46
+ingestr-0.13.5.dist-info/licenses/LICENSE.md,sha256=cW8wIhn8HFE-KLStDF9jHQ1O_ARWP3kTpk_-eOccL24,1075
+ingestr-0.13.5.dist-info/RECORD,,
{ingestr-0.13.3.dist-info → ingestr-0.13.5.dist-info}/WHEEL
File without changes
{ingestr-0.13.3.dist-info → ingestr-0.13.5.dist-info}/entry_points.txt
File without changes
{ingestr-0.13.3.dist-info → ingestr-0.13.5.dist-info}/licenses/LICENSE.md
File without changes