ingestr 0.13.3__py3-none-any.whl → 0.13.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of ingestr might be problematic. Click here for more details.

@@ -7,12 +7,13 @@ import tempfile
7
7
  from urllib.parse import parse_qs, quote, urlparse
8
8
 
9
9
  import dlt
10
- import pyarrow.parquet # type: ignore
11
10
  from dlt.common.configuration.specs import AwsCredentials
12
11
  from dlt.destinations.impl.clickhouse.configuration import (
13
12
  ClickHouseCredentials,
14
13
  )
15
14
 
15
+ from ingestr.src.loader import load_dlt_file
16
+
16
17
 
17
18
  class GenericSqlDestination:
18
19
  def dlt_run_params(self, uri: str, table: str, **kwargs) -> dict:
@@ -59,9 +60,14 @@ class BigQueryDestination:
59
60
  base64.b64decode(credentials_base64[0]).decode("utf-8")
60
61
  )
61
62
 
63
+ project_id = None
64
+ if source_fields.hostname:
65
+ project_id = source_fields.hostname
66
+
62
67
  return dlt.destinations.bigquery(
63
68
  credentials=credentials, # type: ignore
64
69
  location=location,
70
+ project_id=project_id,
65
71
  **kwargs,
66
72
  )
67
73
 
@@ -184,11 +190,9 @@ class CsvDestination(GenericSqlDestination):
184
190
  if output_path.count("/") > 1:
185
191
  os.makedirs(os.path.dirname(output_path), exist_ok=True)
186
192
 
187
- table = pyarrow.parquet.read_table(first_file_path)
188
- rows = table.to_pylist()
189
193
  with open(output_path, "w", newline="") as csv_file:
190
194
  csv_writer = None
191
- for row in rows:
195
+ for row in load_dlt_file(first_file_path):
192
196
  row = filter_keys(row)
193
197
  if csv_writer is None:
194
198
  csv_writer = csv.DictWriter(csv_file, fieldnames=row.keys())
ingestr/src/factory.py CHANGED
@@ -66,6 +66,7 @@ SQL_SOURCE_SCHEMES = [
66
66
  "oracle",
67
67
  "oracle+cx_oracle",
68
68
  "hana",
69
+ "clickhouse",
69
70
  ]
70
71
 
71
72
 
ingestr/src/loader.py ADDED
@@ -0,0 +1,69 @@
1
+ import csv
2
+ import gzip
3
+ import json
4
+ import subprocess
5
+ from contextlib import contextmanager
6
+ from typing import Generator
7
+
8
+ from pyarrow.parquet import ParquetFile # type: ignore
9
+
10
+ PARQUET_BATCH_SIZE = 64
11
+
12
+
13
class UnsupportedLoaderFileFormat(Exception):
    """Raised when a dlt loader file's format cannot be recognised.

    The exception message carries the detected filetype description so the
    caller can report what was actually found on disk.
    """

    pass
15
+
16
+
17
def load_dlt_file(filepath: str) -> Generator:
    """
    load_dlt_file reads dlt loader files. It handles different loader file
    formats automatically and returns a generator that yields data items as
    python dicts.

    The format is detected from the file's leading magic bytes rather than by
    shelling out to the `file` utility: `file` is unavailable on Windows and
    many minimal containers, spawns a process per file, and its output strings
    vary across versions/platforms. The detected label keeps the same prefixes
    (`gzip`, `CSV`, `Apache Parquet`) that `factory` dispatches on.
    """
    with open(filepath, "rb") as fd:
        magic = fd.read(4)

    if magic[:2] == b"\x1f\x8b":
        # gzip magic number (RFC 1952); dlt writes jsonl files gzip-compressed.
        filetype = "gzip"
    elif magic == b"PAR1":
        # Parquet files begin (and end) with the ASCII marker "PAR1".
        filetype = "Apache Parquet"
    else:
        # No magic number: treat decodable text as CSV, anything else as
        # unknown binary so that factory() raises UnsupportedLoaderFileFormat.
        try:
            with open(filepath, "r", encoding="utf-8") as probe:
                probe.read(4096)
            filetype = "CSV"
        except (UnicodeDecodeError, ValueError):
            filetype = "data"

    with factory(filetype, filepath) as reader:
        yield from reader
32
+
33
+
34
def factory(filetype: str, filepath: str):
    """Return the reader context manager matching *filetype*, opened over
    *filepath*.

    *filetype* is a `file -b` style description; dispatch is by prefix.
    Raises UnsupportedLoaderFileFormat for anything unrecognised.
    """
    # NOTE(turtledev): can dlt produce non-gzipped jsonl files?
    dispatch = {
        "gzip": jsonlfile,
        "CSV": csvfile,
        "Apache Parquet": parquetfile,
    }
    for prefix, open_reader in dispatch.items():
        if filetype.startswith(prefix):
            return open_reader(filepath)
    raise UnsupportedLoaderFileFormat(filetype)
44
+
45
+
46
@contextmanager
def jsonlfile(filepath: str):
    """Open a gzip-compressed JSONL file and yield a generator of parsed
    rows (one dict per line)."""

    def iter_rows(stream):
        for raw_line in stream:
            yield json.loads(raw_line.decode().strip())

    with gzip.open(filepath) as stream:
        yield iter_rows(stream)
54
+
55
+
56
@contextmanager
def csvfile(filepath: str):
    """Open a CSV loader file and yield a csv.DictReader over it.

    The file is opened with newline="" as the csv module documentation
    requires; without it, universal-newline translation corrupts newline
    characters embedded inside quoted fields.
    """
    with open(filepath, "r", newline="") as fd:
        yield csv.DictReader(fd)
60
+
61
+
62
@contextmanager
def parquetfile(filepath: str):
    """Open a parquet file and yield a generator of rows as dicts,
    streamed in batches of PARQUET_BATCH_SIZE to bound memory use."""
    with open(filepath, "rb") as handle:
        parquet = ParquetFile(handle)

        def iter_rows():
            for batch in parquet.iter_batches(PARQUET_BATCH_SIZE):
                for item in batch.to_pylist():
                    yield item

        yield iter_rows()
ingestr/src/sources.py CHANGED
@@ -133,6 +133,11 @@ class SqlSource:
133
133
  if uri.startswith("mysql://"):
134
134
  uri = uri.replace("mysql://", "mysql+pymysql://")
135
135
 
136
+ if uri.startswith("clickhouse://"):
137
+ uri = uri.replace("clickhouse://", "clickhouse+native://")
138
+ if "secure=" not in uri:
139
+ uri += "?secure=1"
140
+
136
141
  query_adapters = []
137
142
  if kwargs.get("sql_limit"):
138
143
  query_adapters.append(
ingestr/src/version.py CHANGED
@@ -1 +1,6 @@
1
- __version__ = "0.13.3"
1
# Prefer the build-time version stamped into ingestr.src.buildinfo (generated
# during release packaging); fall back to a development placeholder when
# running from a plain source checkout where that module does not exist.
try:
    from ingestr.src import buildinfo  # type: ignore

    # Release tags look like "vX.Y.Z"; drop the leading "v" marker.
    # NOTE(review): lstrip strips ALL leading "v" characters, which is fine
    # while versions start with a digit after the tag prefix.
    __version__ = buildinfo.version.lstrip("v")
except ImportError:
    __version__ = "0.0.0-dev"
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: ingestr
3
- Version: 0.13.3
3
+ Version: 0.13.5
4
4
  Summary: ingestr is a command-line application that ingests data from various sources and stores them in any database.
5
5
  Project-URL: Homepage, https://github.com/bruin-data/ingestr
6
6
  Project-URL: Issues, https://github.com/bruin-data/ingestr/issues
@@ -132,7 +132,6 @@ Pull requests are welcome. However, please open an issue first to discuss what y
132
132
  > After cloning `ingestr` make sure to run `make setup` to install githooks.
133
133
 
134
134
  ## Supported sources & destinations
135
-
136
135
  <table>
137
136
  <tr>
138
137
  <th></th>
@@ -142,33 +141,28 @@ Pull requests are welcome. However, please open an issue first to discuss what y
142
141
  <tr>
143
142
  <td colspan="3" style='text-align:center;'><strong>Databases</strong></td>
144
143
  </tr>
145
- <tr>
146
- <td>Postgres</td>
147
- <td>✅</td>
148
- <td>✅</td>
149
- </tr>
150
144
  <tr>
151
145
  <td>BigQuery</td>
152
146
  <td>✅</td>
153
147
  <td>✅</td>
154
148
  </tr>
155
149
  <tr>
156
- <td>Snowflake</td>
157
- <td>✅</td>
150
+ <td>ClickHouse</td>
151
+ <td>❌</td>
158
152
  <td>✅</td>
159
153
  </tr>
160
154
  <tr>
161
- <td>Redshift</td>
155
+ <td>Databricks</td>
162
156
  <td>✅</td>
163
157
  <td>✅</td>
164
158
  </tr>
165
159
  <tr>
166
- <td>Databricks</td>
160
+ <td>DuckDB</td>
167
161
  <td>✅</td>
168
162
  <td>✅</td>
169
163
  </tr>
170
164
  <tr>
171
- <td>DuckDB</td>
165
+ <td>Local CSV file</td>
172
166
  <td>✅</td>
173
167
  <td>✅</td>
174
168
  </tr>
@@ -178,12 +172,12 @@ Pull requests are welcome. However, please open an issue first to discuss what y
178
172
  <td>✅</td>
179
173
  </tr>
180
174
  <tr>
181
- <td>Local CSV file</td>
182
- <td>✅</td>
175
+ <td>MongoDB</td>
183
176
  <td>✅</td>
177
+ <td>❌</td>
184
178
  </tr>
185
179
  <tr>
186
- <td>MongoDB</td>
180
+ <td>MySQL</td>
187
181
  <td>✅</td>
188
182
  <td>❌</td>
189
183
  </tr>
@@ -192,18 +186,28 @@ Pull requests are welcome. However, please open an issue first to discuss what y
192
186
  <td>✅</td>
193
187
  <td>❌</td>
194
188
  </tr>
189
+ <tr>
190
+ <td>Postgres</td>
191
+ <td>✅</td>
192
+ <td>✅</td>
193
+ </tr>
194
+ <tr>
195
+ <td>Redshift</td>
196
+ <td>✅</td>
197
+ <td>✅</td>
198
+ </tr>
195
199
  <tr>
196
200
  <td>SAP Hana</td>
197
201
  <td>✅</td>
198
202
  <td>❌</td>
199
203
  </tr>
200
204
  <tr>
201
- <td>SQLite</td>
205
+ <td>Snowflake</td>
206
+ <td>✅</td>
202
207
  <td>✅</td>
203
- <td>❌</td>
204
208
  </tr>
205
209
  <tr>
206
- <td>MySQL</td>
210
+ <td>SQLite</td>
207
211
  <td>✅</td>
208
212
  <td>❌</td>
209
213
  </tr>
@@ -218,11 +222,21 @@ Pull requests are welcome. However, please open an issue first to discuss what y
218
222
  <td>✅</td>
219
223
  <td>-</td>
220
224
  </tr>
225
+ <tr>
226
+ <td>Apache Kafka</td>
227
+ <td>✅</td>
228
+ <td>-</td>
229
+ </tr>
221
230
  <tr>
222
231
  <td>AppsFlyer</td>
223
232
  <td>✅</td>
224
233
  <td>-</td>
225
234
  </tr>
235
+ <tr>
236
+ <td>App Store</td>
237
+ <td>✅</td>
238
+ <td>-</td>
239
+ </tr>
226
240
  <tr>
227
241
  <td>Asana</td>
228
242
  <td>✅</td>
@@ -243,6 +257,11 @@ Pull requests are welcome. However, please open an issue first to discuss what y
243
257
  <td>✅</td>
244
258
  <td>-</td>
245
259
  </tr>
260
+ <tr>
261
+ <td>Github</td>
262
+ <td>✅</td>
263
+ <td>-</td>
264
+ </tr>
246
265
  <tr>
247
266
  <td>Gorgias</td>
248
267
  <td>✅</td>
@@ -254,7 +273,17 @@ Pull requests are welcome. However, please open an issue first to discuss what y
254
273
  <td>-</td>
255
274
  </tr>
256
275
  <tr>
257
- <td>HubSpot</td>
276
+ <td>Google Ads</td>
277
+ <td>✅</td>
278
+ <td>-</td>
279
+ </tr>
280
+ <tr>
281
+ <td>Google Analytics</td>
282
+ <td>✅</td>
283
+ <td>-</td>
284
+ </tr>
285
+ <tr>
286
+ <td>Intercom</td>
258
287
  <td>✅</td>
259
288
  <td>-</td>
260
289
  </tr>
@@ -263,6 +292,11 @@ Pull requests are welcome. However, please open an issue first to discuss what y
263
292
  <td>✅</td>
264
293
  <td>-</td>
265
294
  </tr>
295
+ <tr>
296
+ <td>LinkedIn Ads</td>
297
+ <td>✅</td>
298
+ <td>-</td>
299
+ </tr>
266
300
  <tr>
267
301
  <td>Notion</td>
268
302
  <td>✅</td>
@@ -288,6 +322,11 @@ Pull requests are welcome. However, please open an issue first to discuss what y
288
322
  <td>✅</td>
289
323
  <td>-</td>
290
324
  </tr>
325
+ <tr>
326
+ <td>TikTok Ads</td>
327
+ <td>✅</td>
328
+ <td>-</td>
329
+ </tr>
291
330
  <tr>
292
331
  <td>Zendesk</td>
293
332
  <td>✅</td>
@@ -1,14 +1,15 @@
1
1
  ingestr/main.py,sha256=ufn8AcM2ID80ChUApJzYDjnQaurMXOkYfTm6GzAggSQ,24746
2
2
  ingestr/src/.gitignore,sha256=8cX1AZTSI0TcdZFGTmS_oyBjpfCzhOEt0DdAo2dFIY8,203
3
3
  ingestr/src/blob.py,sha256=XDk_XqmU_He4sQ1brY3ceoZgpq_ZBZihz1gHW9MzqUk,1381
4
- ingestr/src/destinations.py,sha256=WxerdCx0gS4JveYAE-GzdJLbgP4t2QXXQhUoU3GvmLM,11194
4
+ ingestr/src/destinations.py,sha256=vrGij4qMPCdXTMIimROWBJFqzOqCM4DFmgyubgSHejA,11279
5
5
  ingestr/src/errors.py,sha256=Ufs4_DfE77_E3vnA1fOQdi6cmuLVNm7_SbFLkL1XPGk,686
6
- ingestr/src/factory.py,sha256=b1Fg3lhTu6HoALi4AFrTOm3fbNPj5EB2lPigrjiY1so,4926
6
+ ingestr/src/factory.py,sha256=CG_CZox_vJOYfmlCJ1FH9Ipb5LDeESCzox5ZOAO-wjs,4944
7
7
  ingestr/src/filters.py,sha256=0JQXeAr2APFMnW2sd-6BlAMWv93bXV17j8b5MM8sHmM,580
8
- ingestr/src/sources.py,sha256=YkBCyjI8DWovyA8CI68zOlHT4UKPsG8-UTCyuafI_WA,63075
8
+ ingestr/src/loader.py,sha256=9NaWAyfkXdqAZSS-N72Iwo36Lbx4PyqIfaaH1dNdkFs,1712
9
+ ingestr/src/sources.py,sha256=AxtWtqhn1OOFAYypLph2FhmD3CD7CDVBRvOIKb6mDIg,63263
9
10
  ingestr/src/table_definition.py,sha256=REbAbqdlmUMUuRh8nEQRreWjPVOQ5ZcfqGkScKdCrmk,390
10
11
  ingestr/src/time.py,sha256=H_Fk2J4ShXyUM-EMY7MqCLZQhlnZMZvO952bmZPc4yE,254
11
- ingestr/src/version.py,sha256=aiCDTKDs80gZjvqiXYkxhq_MLi4Du0L9OByDr6ZEVV4,23
12
+ ingestr/src/version.py,sha256=l6zVm0GMMwnBlIOONWc6snhko9d8-HO1y6Jj1T1vsiQ,158
12
13
  ingestr/src/adjust/__init__.py,sha256=ULjtJqrNS6XDvUyGl0tjl12-tLyXlCgeFe2icTbtu3Q,3255
13
14
  ingestr/src/adjust/adjust_helpers.py,sha256=av97NPSn-hQtTbAC0vUSCAWYePmOiG5R-DGdMssm7FQ,3646
14
15
  ingestr/src/airtable/__init__.py,sha256=GHWYrjI2qhs_JihdNJysB0Ni3bzqT_MLXn_S9_Q5zRA,2775
@@ -101,8 +102,8 @@ ingestr/testdata/delete_insert_part2.csv,sha256=B_KUzpzbNdDY_n7wWop1mT2cz36TmayS
101
102
  ingestr/testdata/merge_expected.csv,sha256=DReHqWGnQMsf2PBv_Q2pfjsgvikYFnf1zYcQZ7ZqYN0,276
102
103
  ingestr/testdata/merge_part1.csv,sha256=Pw8Z9IDKcNU0qQHx1z6BUf4rF_-SxKGFOvymCt4OY9I,185
103
104
  ingestr/testdata/merge_part2.csv,sha256=T_GiWxA81SN63_tMOIuemcvboEFeAmbKc7xRXvL9esw,287
104
- ingestr-0.13.3.dist-info/METADATA,sha256=mVT47j7eOqecG3fnDHEkWzqmM4QHWDUixAWKpkGsoEk,8252
105
- ingestr-0.13.3.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
106
- ingestr-0.13.3.dist-info/entry_points.txt,sha256=oPJy0KBnPWYjDtP1k8qwAihcTLHSZokSQvRAw_wtfJM,46
107
- ingestr-0.13.3.dist-info/licenses/LICENSE.md,sha256=cW8wIhn8HFE-KLStDF9jHQ1O_ARWP3kTpk_-eOccL24,1075
108
- ingestr-0.13.3.dist-info/RECORD,,
105
+ ingestr-0.13.5.dist-info/METADATA,sha256=5kk8N8xrnWAJvOL4n2Mh1Xv7sXOn6iJyybHVBEGNRcU,8956
106
+ ingestr-0.13.5.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
107
+ ingestr-0.13.5.dist-info/entry_points.txt,sha256=oPJy0KBnPWYjDtP1k8qwAihcTLHSZokSQvRAw_wtfJM,46
108
+ ingestr-0.13.5.dist-info/licenses/LICENSE.md,sha256=cW8wIhn8HFE-KLStDF9jHQ1O_ARWP3kTpk_-eOccL24,1075
109
+ ingestr-0.13.5.dist-info/RECORD,,