ingestr 0.13.22__py3-none-any.whl → 0.13.24__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of ingestr might be problematic. See the package's registry page for more details.

ingestr/conftest.py ADDED
@@ -0,0 +1,63 @@
1
+ import os
2
+ import tempfile
3
+ from concurrent.futures import ThreadPoolExecutor
4
+
5
+ from main_test import DESTINATIONS, SOURCES # type: ignore
6
+
7
+
8
+ def pytest_configure(config):
9
+ if is_master(config):
10
+ config.shared_directory = tempfile.mkdtemp()
11
+
12
+
13
+ def pytest_configure_node(node):
14
+ """xdist hook"""
15
+ node.workerinput["shared_directory"] = node.config.shared_directory
16
+
17
+
18
+ def is_master(config):
19
+ """True if the code running the given pytest.config object is running in a xdist master
20
+ node or not running xdist at all.
21
+ """
22
+ return not hasattr(config, "workerinput")
23
+
24
+
25
+ def start_containers(config):
26
+ if hasattr(config, "workerinput"):
27
+ return
28
+
29
+ unique_containers = set(SOURCES.values()) | set(DESTINATIONS.values())
30
+ for container in unique_containers:
31
+ container.container_lock_dir = config.shared_directory
32
+
33
+ with ThreadPoolExecutor() as executor:
34
+ for container in unique_containers:
35
+ executor.submit(container.start_fully)
36
+ # futures = [
37
+ # executor.submit(container.start_fully) for container in unique_containers
38
+ # ]
39
+ # # Wait for all futures to complete
40
+ # for future in futures:
41
+ # future.result()
42
+
43
+
44
+ def stop_containers(config):
45
+ if hasattr(config, "workerinput"):
46
+ return
47
+
48
+ should_manage_containers = os.environ.get("PYTEST_XDIST_WORKER", "gw0") == "gw0"
49
+ if not should_manage_containers:
50
+ return
51
+
52
+ unique_containers = set(SOURCES.values()) | set(DESTINATIONS.values())
53
+
54
+ for container in unique_containers:
55
+ container.stop_fully()
56
+
57
+
58
+ def pytest_sessionstart(session):
59
+ start_containers(session.config)
60
+
61
+
62
+ def pytest_sessionfinish(session, exitstatus):
63
+ stop_containers(session.config)
ingestr/main.py CHANGED
@@ -11,7 +11,7 @@ from typing_extensions import Annotated
11
11
  import ingestr.src.partition as partition
12
12
  import ingestr.src.resource as resource
13
13
  from ingestr.src.destinations import AthenaDestination
14
- from ingestr.src.filters import cast_set_to_list
14
+ from ingestr.src.filters import cast_set_to_list, handle_mysql_empty_dates
15
15
  from ingestr.src.telemetry.event import track
16
16
 
17
17
  app = typer.Typer(
@@ -35,7 +35,7 @@ DATE_FORMATS = [
35
35
 
36
36
  # https://dlthub.com/docs/dlt-ecosystem/file-formats/parquet#supported-destinations
37
37
  PARQUET_SUPPORTED_DESTINATIONS = [
38
- "athena" "bigquery",
38
+ "athenabigquery",
39
39
  "duckdb",
40
40
  "snowflake",
41
41
  "databricks",
@@ -553,6 +553,8 @@ def ingest(
553
553
  )
554
554
 
555
555
  resource.for_each(dlt_source, lambda x: x.add_map(cast_set_to_list))
556
+ if factory.source_scheme.startswith("mysql"):
557
+ resource.for_each(dlt_source, lambda x: x.add_map(handle_mysql_empty_dates))
556
558
 
557
559
  def col_h(x):
558
560
  if column_hints:
ingestr/src/buildinfo.py CHANGED
@@ -1 +1 @@
1
- version = "v0.13.22"
1
+ version = "v0.13.24"
ingestr/src/filters.py CHANGED
@@ -10,6 +10,27 @@ def cast_set_to_list(row):
10
10
  return row
11
11
 
12
12
 
13
+ def handle_mysql_empty_dates(row):
14
+ # MySQL returns empty dates as 0000-00-00, which is not a valid date, we handle them here.
15
+ if not isinstance(row, dict):
16
+ return row
17
+
18
+ for key in row.keys():
19
+ if not isinstance(row[key], str):
20
+ continue
21
+
22
+ if row[key] == "0000-00-00":
23
+ from datetime import date
24
+
25
+ row[key] = date(1970, 1, 1)
26
+
27
+ elif row[key] == "0000-00-00 00:00:00":
28
+ from datetime import datetime
29
+
30
+ row[key] = datetime(1970, 1, 1, 0, 0, 0)
31
+ return row
32
+
33
+
13
34
  def table_adapter_exclude_columns(cols: list[str]):
14
35
  def excluder(table: Table):
15
36
  cols_to_remove = [col for col in table._columns if col.name in cols] # type: ignore
@@ -103,9 +103,9 @@ def get_reactions_data(
103
103
 
104
104
 
105
105
  def _extract_top_connection(data: StrAny, node_type: str) -> StrAny:
106
- assert (
107
- isinstance(data, dict) and len(data) == 1
108
- ), f"The data with list of {node_type} must be a dictionary and contain only one element"
106
+ assert isinstance(data, dict) and len(data) == 1, (
107
+ f"The data with list of {node_type} must be a dictionary and contain only one element"
108
+ )
109
109
  data = next(iter(data.values()))
110
110
  return data[node_type] # type: ignore
111
111
 
@@ -158,7 +158,7 @@ def _get_graphql_pages(
158
158
  )
159
159
  items_count += len(data_items)
160
160
  print(
161
- f'Got {len(data_items)}/{items_count} {node_type}s, query cost {rate_limit["cost"]}, remaining credits: {rate_limit["remaining"]}'
161
+ f"Got {len(data_items)}/{items_count} {node_type}s, query cost {rate_limit['cost']}, remaining credits: {rate_limit['remaining']}"
162
162
  )
163
163
  if data_items:
164
164
  yield data_items
@@ -187,7 +187,7 @@ def _get_comment_reaction(comment_ids: List[str], access_token: str) -> StrAny:
187
187
  # print(query)
188
188
  page, rate_limit = _run_graphql_query(access_token, query, {})
189
189
  print(
190
- f'Got {len(page)} comments, query cost {rate_limit["cost"]}, remaining credits: {rate_limit["remaining"]}'
190
+ f"Got {len(page)} comments, query cost {rate_limit['cost']}, remaining credits: {rate_limit['remaining']}"
191
191
  )
192
192
  data.update(page)
193
193
  return data
@@ -70,9 +70,9 @@ def google_spreadsheet(
70
70
  spreadsheet_id=spreadsheet_id,
71
71
  range_names=list(all_range_names),
72
72
  )
73
- assert len(all_range_names) == len(
74
- all_range_data
75
- ), "Google Sheets API must return values for all requested ranges"
73
+ assert len(all_range_names) == len(all_range_data), (
74
+ "Google Sheets API must return values for all requested ranges"
75
+ )
76
76
 
77
77
  # get metadata for two first rows of each range
78
78
  # first should contain headers
@@ -126,7 +126,7 @@ def google_spreadsheet(
126
126
  headers = get_range_headers(headers_metadata, name)
127
127
  if headers is None:
128
128
  # generate automatic headers and treat the first row as data
129
- headers = [f"col_{idx+1}" for idx in range(len(headers_metadata))]
129
+ headers = [f"col_{idx + 1}" for idx in range(len(headers_metadata))]
130
130
  data_row_metadata = headers_metadata
131
131
  rows_data = values[0:]
132
132
  logger.warning(
@@ -149,12 +149,12 @@ def get_range_headers(headers_metadata: List[DictStrAny], range_name: str) -> Li
149
149
  header_val = str(f"col_{idx + 1}")
150
150
  else:
151
151
  logger.warning(
152
- f"In range {range_name}, header value: {header_val} at position {idx+1} is not a string!"
152
+ f"In range {range_name}, header value: {header_val} at position {idx + 1} is not a string!"
153
153
  )
154
154
  return None
155
155
  else:
156
156
  logger.warning(
157
- f"In range {range_name}, header at position {idx+1} is not missing!"
157
+ f"In range {range_name}, header at position {idx + 1} is not missing!"
158
158
  )
159
159
  return None
160
160
  headers.append(header_val)
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: ingestr
3
- Version: 0.13.22
3
+ Version: 0.13.24
4
4
  Summary: ingestr is a command-line application that ingests data from various sources and stores them in any database.
5
5
  Project-URL: Homepage, https://github.com/bruin-data/ingestr
6
6
  Project-URL: Issues, https://github.com/bruin-data/ingestr/issues
@@ -46,10 +46,10 @@ Requires-Dist: databricks-sqlalchemy==1.0.2
46
46
  Requires-Dist: dataclasses-json==0.6.7
47
47
  Requires-Dist: decorator==5.2.1
48
48
  Requires-Dist: deprecation==2.1.0
49
- Requires-Dist: dlt==1.6.1
49
+ Requires-Dist: dlt==1.9.0
50
50
  Requires-Dist: dnspython==2.7.0
51
- Requires-Dist: duckdb-engine==0.15.0
52
- Requires-Dist: duckdb==1.2.0
51
+ Requires-Dist: duckdb-engine==0.17.0
52
+ Requires-Dist: duckdb==1.2.1
53
53
  Requires-Dist: et-xmlfile==2.0.0
54
54
  Requires-Dist: facebook-business==20.0.0
55
55
  Requires-Dist: filelock==3.17.0
@@ -168,6 +168,7 @@ Requires-Dist: sqlalchemy-hana==2.0.0
168
168
  Requires-Dist: sqlalchemy-redshift==0.8.14
169
169
  Requires-Dist: sqlalchemy2-stubs==0.0.2a38
170
170
  Requires-Dist: sqlalchemy==1.4.52
171
+ Requires-Dist: sqlglot==26.12.1
171
172
  Requires-Dist: stripe==10.7.0
172
173
  Requires-Dist: tenacity==9.0.0
173
174
  Requires-Dist: thrift==0.16.0
@@ -1,11 +1,12 @@
1
- ingestr/main.py,sha256=74lbiWEa27MUKFPbyUNGIlrwD5fRxej5cKFwe_LX1pE,25452
1
+ ingestr/conftest.py,sha256=Q03FIJIZpLBbpj55cfCHIKEjc1FCvWJhMF2cidUJKQU,1748
2
+ ingestr/main.py,sha256=wvbRCJ2--M0Zw2cYtSH874TxTtlD0wadHREeLG3anOY,25618
2
3
  ingestr/src/.gitignore,sha256=8cX1AZTSI0TcdZFGTmS_oyBjpfCzhOEt0DdAo2dFIY8,203
3
4
  ingestr/src/blob.py,sha256=onMe5ZHxPXTdcB_s2oGNdMo-XQJ3ajwOsWE9eSTGFmc,1495
4
- ingestr/src/buildinfo.py,sha256=ExEPLDyz3-FkQx0OHsblNsR-B9G1fUx77cQtxlv6CXA,21
5
+ ingestr/src/buildinfo.py,sha256=x-bxDOuFDQ1rgDJf03eHD1bXb9Yfo3wX39XyaBE0LkU,21
5
6
  ingestr/src/destinations.py,sha256=vrGij4qMPCdXTMIimROWBJFqzOqCM4DFmgyubgSHejA,11279
6
7
  ingestr/src/errors.py,sha256=Ufs4_DfE77_E3vnA1fOQdi6cmuLVNm7_SbFLkL1XPGk,686
7
8
  ingestr/src/factory.py,sha256=1jqcLv_QUUGeyg1OYN3ywrRdcDZyDRtMOongwyjDapU,5268
8
- ingestr/src/filters.py,sha256=0JQXeAr2APFMnW2sd-6BlAMWv93bXV17j8b5MM8sHmM,580
9
+ ingestr/src/filters.py,sha256=5LNpBgm8FJXdrFHGyM7dLVyphKykSpPk7yuQAZ8GML4,1133
9
10
  ingestr/src/loader.py,sha256=9NaWAyfkXdqAZSS-N72Iwo36Lbx4PyqIfaaH1dNdkFs,1712
10
11
  ingestr/src/partition.py,sha256=E0WHqh1FTheQAIVK_-jWUx0dgyYZCD1VxlAm362gao4,964
11
12
  ingestr/src/resource.py,sha256=XG-sbBapFVEM7OhHQFQRTdTLlh-mHB-N4V1t8F8Tsww,543
@@ -41,7 +42,7 @@ ingestr/src/filesystem/__init__.py,sha256=zkIwbRr0ir0EUdniI25p2zGiVc-7M9EmR351Aj
41
42
  ingestr/src/filesystem/helpers.py,sha256=bg0muSHZr3hMa8H4jN2-LGWzI-SUoKlQNiWJ74-YYms,3211
42
43
  ingestr/src/filesystem/readers.py,sha256=a0fKkaRpnAOGsXI3EBNYZa7x6tlmAOsgRzb883StY30,3987
43
44
  ingestr/src/github/__init__.py,sha256=xVijF-Wi4p88hkVJnKH-oTixismjD3aUcGqGa6Wr4e4,5889
44
- ingestr/src/github/helpers.py,sha256=Tmnik9811zBWNO6cJwV9PFQxEx2j32LHAQCvNbubsEI,6759
45
+ ingestr/src/github/helpers.py,sha256=rpv_3HzuOl4PQ-FUeA66pev-pgze9SaE8RUHIPYfZ_A,6759
45
46
  ingestr/src/github/queries.py,sha256=W34C02jUEdjFmOE7f7u9xvYyBNDMfVZAu0JIRZI2mkU,2302
46
47
  ingestr/src/github/settings.py,sha256=N5ahWrDIQ_4IWV9i-hTXxyYduqY9Ym2BTwqsWxcDdJ8,258
47
48
  ingestr/src/google_ads/__init__.py,sha256=bH0TtnRWcOUESezpvoA7VEUHAq_0ITGQeX4GGVBfl1I,3725
@@ -52,10 +53,10 @@ ingestr/src/google_ads/reports.py,sha256=AVY1pPt5yaIFskQe1k5VW2Dhlux3bzewsHlDrdG
52
53
  ingestr/src/google_analytics/__init__.py,sha256=8Evpmoy464YpNbCI_NmvFHIzWCu7J7SjJw-RrPZ6AL8,3674
53
54
  ingestr/src/google_analytics/helpers.py,sha256=vLmFyQ_IEJEK5LlxBJQeJw0VHaE5gRRZdBa54U72CaQ,5965
54
55
  ingestr/src/google_sheets/README.md,sha256=wFQhvmGpRA38Ba2N_WIax6duyD4c7c_pwvvprRfQDnw,5470
55
- ingestr/src/google_sheets/__init__.py,sha256=5qlX-6ilx5MW7klC7B_0jGSxloQSLkSESTh4nlY3Aos,6643
56
+ ingestr/src/google_sheets/__init__.py,sha256=CL0HfY74uxX8-ge0ucI0VhWMYZVAfoX7WRPBitRi-CI,6647
56
57
  ingestr/src/google_sheets/helpers/__init__.py,sha256=5hXZrZK8cMO3UOuL-s4OKOpdACdihQD0hYYlSEu-iQ8,35
57
58
  ingestr/src/google_sheets/helpers/api_calls.py,sha256=RiVfdacbaneszhmuhYilkJnkc9kowZvQUCUxz0G6SlI,5404
58
- ingestr/src/google_sheets/helpers/data_processing.py,sha256=WYO6z4XjGcG0Hat2J2enb-eLX5mSNVb2vaqRE83FBWU,11000
59
+ ingestr/src/google_sheets/helpers/data_processing.py,sha256=RNt2MYfdJhk4bRahnQVezpNg2x9z0vx60YFq2ukZ8vI,11004
59
60
  ingestr/src/gorgias/__init__.py,sha256=_mFkMYwlY5OKEY0o_FK1OKol03A-8uk7bm1cKlmt5cs,21432
60
61
  ingestr/src/gorgias/helpers.py,sha256=DamuijnvhGY9hysQO4txrVMf4izkGbh5qfBKImdOINE,5427
61
62
  ingestr/src/hubspot/__init__.py,sha256=NYgSIAPXQh2Qp1eKun7TgcerKogq6pWtNkr-_f0FXbI,9464
@@ -118,8 +119,8 @@ ingestr/testdata/delete_insert_part2.csv,sha256=B_KUzpzbNdDY_n7wWop1mT2cz36TmayS
118
119
  ingestr/testdata/merge_expected.csv,sha256=DReHqWGnQMsf2PBv_Q2pfjsgvikYFnf1zYcQZ7ZqYN0,276
119
120
  ingestr/testdata/merge_part1.csv,sha256=Pw8Z9IDKcNU0qQHx1z6BUf4rF_-SxKGFOvymCt4OY9I,185
120
121
  ingestr/testdata/merge_part2.csv,sha256=T_GiWxA81SN63_tMOIuemcvboEFeAmbKc7xRXvL9esw,287
121
- ingestr-0.13.22.dist-info/METADATA,sha256=SC89LgkVuV22LAaSCETkDoT6bFYCgIkHjLgs2UP4q4c,13627
122
- ingestr-0.13.22.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
123
- ingestr-0.13.22.dist-info/entry_points.txt,sha256=oPJy0KBnPWYjDtP1k8qwAihcTLHSZokSQvRAw_wtfJM,46
124
- ingestr-0.13.22.dist-info/licenses/LICENSE.md,sha256=cW8wIhn8HFE-KLStDF9jHQ1O_ARWP3kTpk_-eOccL24,1075
125
- ingestr-0.13.22.dist-info/RECORD,,
122
+ ingestr-0.13.24.dist-info/METADATA,sha256=qzKyEOTPIb6cTD49Q6zC-bqSn7ax45JmCKcGh0jtJRw,13659
123
+ ingestr-0.13.24.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
124
+ ingestr-0.13.24.dist-info/entry_points.txt,sha256=oPJy0KBnPWYjDtP1k8qwAihcTLHSZokSQvRAw_wtfJM,46
125
+ ingestr-0.13.24.dist-info/licenses/LICENSE.md,sha256=cW8wIhn8HFE-KLStDF9jHQ1O_ARWP3kTpk_-eOccL24,1075
126
+ ingestr-0.13.24.dist-info/RECORD,,