ingestr 0.13.39__py3-none-any.whl → 0.13.41__py3-none-any.whl

This diff represents the content of publicly released package versions as they appear in their respective public registries. It is provided for informational purposes only.

Potentially problematic release.

ingestr/src/buildinfo.py CHANGED
@@ -1 +1 @@
- version = "v0.13.39"
+ version = "v0.13.41"
ingestr/src/elasticsearch/__init__.py ADDED
@@ -0,0 +1,80 @@
+ from datetime import date, datetime
+ from typing import Any, Optional
+
+ import dlt
+ import pendulum
+ from dlt.common.time import ensure_pendulum_datetime
+ from pendulum import parse
+
+ from elasticsearch import Elasticsearch
+
+
+ @dlt.source
+ def elasticsearch_source(
+     connection_url: str,
+     index: str,
+     verify_certs: bool,
+     incremental: Optional[dlt.sources.incremental] = None,
+ ):
+     client = Elasticsearch(connection_url, verify_certs=verify_certs)
+
+     @dlt.resource(
+         name=index, primary_key="id", write_disposition="merge", incremental=incremental
+     )
+     def get_documents(incremental=incremental):
+         body = {"query": {"match_all": {}}}
+
+         if incremental:
+             start_value = incremental.last_value
+             range_filter = {"gte": start_value}
+             if incremental.end_value is not None:
+                 range_filter["lt"] = incremental.end_value
+             body = {"query": {"range": {incremental.cursor_path: range_filter}}}
+
+         page = client.search(index=index, scroll="5m", size=5, body=body)
+
+         sid = page["_scroll_id"]
+         hits = page["hits"]["hits"]
+
+         if not hits:
+             return
+
+         # fetching first page (via .search)
+         for doc in hits:
+             doc_data = {"id": doc["_id"], **doc["_source"]}
+             if incremental:
+                 doc_data[incremental.cursor_path] = convert_elasticsearch_objs(
+                     doc_data[incremental.cursor_path]
+                 )
+             yield doc_data
+
+         while True:
+             # fetching page 2 and other pages (via .scroll)
+             page = client.scroll(scroll_id=sid, scroll="5m")
+             sid = page["_scroll_id"]
+             hits = page["hits"]["hits"]
+             if not hits:
+                 break
+             for doc in hits:
+                 doc_data = {"id": doc["_id"], **doc["_source"]}
+                 if incremental:
+                     doc_data[incremental.cursor_path] = convert_elasticsearch_objs(
+                         doc_data[incremental.cursor_path]
+                     )
+                 yield doc_data
+
+         client.clear_scroll(scroll_id=sid)
+
+     return get_documents
+
+
+ def convert_elasticsearch_objs(value: Any) -> Any:
+     if isinstance(value, str):
+         parsed_date = parse(value, strict=False)
+         if parsed_date is not None:
+             if isinstance(
+                 parsed_date,
+                 (pendulum.DateTime, pendulum.Date, datetime, date, str, float, int),
+             ):
+                 return ensure_pendulum_datetime(parsed_date)
+     return value
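
For context, a minimal sketch (not part of the release) of how this new source can be driven directly with dlt; the index name "events", the cursor field "updated_at", and the DuckDB destination are illustrative assumptions:

import dlt

from ingestr.src.elasticsearch import elasticsearch_source

# Hypothetical index and cursor field; assumes a reachable local cluster.
source = elasticsearch_source(
    connection_url="http://localhost:9200",
    index="events",
    verify_certs=False,
    incremental=dlt.sources.incremental("updated_at"),
)

pipeline = dlt.pipeline(pipeline_name="es_demo", destination="duckdb", dataset_name="raw")
print(pipeline.run(source.with_resources("events")))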
ingestr/src/factory.py CHANGED
@@ -28,6 +28,7 @@ from ingestr.src.sources import (
      AsanaSource,
      ChessSource,
      DynamoDBSource,
+     ElasticsearchSource,
      FacebookAdsSource,
      FrankfurterSource,
      FreshdeskSource,
@@ -78,6 +79,7 @@ SQL_SOURCE_SCHEMES = [
      "clickhouse",
      "databricks",
      "db2",
+     "spanner",
  ]
  
  
@@ -153,6 +155,7 @@ class SourceDestinationFactory:
          "frankfurter": FrankfurterSource,
          "freshdesk": FreshdeskSource,
          "phantombuster": PhantombusterSource,
+         "elasticsearch": ElasticsearchSource,
      }
      destinations: Dict[str, Type[DestinationProtocol]] = {
          "bigquery": BigQueryDestination,
ingestr/src/sources.py CHANGED
@@ -52,7 +52,10 @@ class SqlSource:
      def dlt_source(self, uri: str, table: str, **kwargs):
          table_fields = TableDefinition(dataset="custom", table="custom")
          if not table.startswith("query:"):
-             table_fields = table_string_to_dataclass(table)
+             if uri.startswith("spanner://"):
+                 table_fields = TableDefinition(dataset="", table=table)
+             else:
+                 table_fields = table_string_to_dataclass(table)
  
          incremental = None
          if kwargs.get("incremental_key"):
@@ -113,6 +116,45 @@
          if uri.startswith("db2://"):
              uri = uri.replace("db2://", "db2+ibm_db://")
  
+         if uri.startswith("spanner://"):
+             parsed_uri = urlparse(uri)
+             query_params = parse_qs(parsed_uri.query)
+
+             project_id_param = query_params.get("project_id")
+             instance_id_param = query_params.get("instance_id")
+             database_param = query_params.get("database")
+
+             cred_path = query_params.get("credentials_path")
+             cred_base64 = query_params.get("credentials_base64")
+
+             if not project_id_param or not instance_id_param or not database_param:
+                 raise ValueError(
+                     "project_id, instance_id and database are required in the URI to get data from Google Spanner"
+                 )
+
+             project_id = project_id_param[0]
+             instance_id = instance_id_param[0]
+             database = database_param[0]
+
+             if not cred_path and not cred_base64:
+                 raise ValueError(
+                     "credentials_path or credentials_base64 is required in the URI to get data from Google Spanner"
+                 )
+             if cred_path:
+                 os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = cred_path[0]
+             elif cred_base64:
+                 credentials = json.loads(
+                     base64.b64decode(cred_base64[0]).decode("utf-8")
+                 )
+                 temp = tempfile.NamedTemporaryFile(
+                     mode="w", delete=False, suffix=".json"
+                 )
+                 json.dump(credentials, temp)
+                 temp.close()
+                 os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = temp.name
+
+             uri = f"spanner+spanner:///projects/{project_id}/instances/{instance_id}/databases/{database}"
+
          from dlt.common.libs.sql_alchemy import (
              Engine,
              MetaData,
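
As a standalone illustration (not shipped in the package), the branch above rewrites a spanner:// source URI into the sqlalchemy-spanner dialect URL; the project, instance, and database names below are placeholders:

from urllib.parse import parse_qs, urlparse

uri = (
    "spanner://?project_id=my-project"
    "&instance_id=my-instance"
    "&database=my-db"
    "&credentials_path=/path/to/service-account.json"
)
params = parse_qs(urlparse(uri).query)
rewritten = (
    f"spanner+spanner:///projects/{params['project_id'][0]}"
    f"/instances/{params['instance_id'][0]}"
    f"/databases/{params['database'][0]}"
)
# Prints: spanner+spanner:///projects/my-project/instances/my-instance/databases/my-db
print(rewritten)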
@@ -2298,3 +2340,55 @@ class PhantombusterSource:
              start_date=start_date,
              end_date=end_date,
          ).with_resources(table_name)
+
+
+ class ElasticsearchSource:
+     def handles_incrementality(self) -> bool:
+         return False
+
+     def dlt_source(self, uri: str, table: str, **kwargs):
+         from ingestr.src.elasticsearch import elasticsearch_source
+
+         incremental = None
+         if kwargs.get("incremental_key"):
+             start_value = kwargs.get("interval_start")
+             end_value = kwargs.get("interval_end")
+
+             incremental = dlt_incremental(
+                 kwargs.get("incremental_key", ""),
+                 initial_value=start_value,
+                 end_value=end_value,
+                 range_end="closed",
+                 range_start="closed",
+             )
+
+         # elasticsearch://localhost:9200?secure=true&verify_certs=false
+         parsed = urlparse(uri)
+
+         index = table
+         if not index:
+             raise ValueError(
+                 "Table name must be provided which is the index name in elasticsearch"
+             )
+
+         query_params = parsed.query
+         params = parse_qs(query_params)
+
+         secure = True
+         if "secure" in params:
+             secure = params["secure"][0].capitalize() == "True"
+
+         verify_certs = True
+         if "verify_certs" in params:
+             verify_certs = params["verify_certs"][0].capitalize() == "True"
+
+         scheme = "https" if secure else "http"
+         netloc = parsed.netloc
+         connection_url = f"{scheme}://{netloc}"
+
+         return elasticsearch_source(
+             connection_url=connection_url,
+             index=index,
+             verify_certs=verify_certs,
+             incremental=incremental,
+         ).with_resources(table)
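
Similarly, a hedged sketch of how ElasticsearchSource maps a source URI to the connection URL it hands to the client; the host and flag values are illustrative:

from urllib.parse import parse_qs, urlparse

uri = "elasticsearch://localhost:9200?secure=false&verify_certs=false"
parsed = urlparse(uri)
params = parse_qs(parsed.query)

# secure defaults to True (https) when the flag is absent from the URI.
secure = params.get("secure", ["true"])[0].capitalize() == "True"
scheme = "https" if secure else "http"
# Prints: http://localhost:9200
print(f"{scheme}://{parsed.netloc}")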
ingestr-0.13.39.dist-info/METADATA → ingestr-0.13.41.dist-info/METADATA RENAMED
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: ingestr
- Version: 0.13.39
+ Version: 0.13.41
  Summary: ingestr is a command-line application that ingests data from various sources and stores them in any database.
  Project-URL: Homepage, https://github.com/bruin-data/ingestr
  Project-URL: Issues, https://github.com/bruin-data/ingestr/issues
@@ -50,6 +50,8 @@ Requires-Dist: dlt==1.10.0
  Requires-Dist: dnspython==2.7.0
  Requires-Dist: duckdb-engine==0.17.0
  Requires-Dist: duckdb==1.2.1
+ Requires-Dist: elastic-transport==8.17.1
+ Requires-Dist: elasticsearch==8.10.1
  Requires-Dist: et-xmlfile==2.0.0
  Requires-Dist: facebook-business==20.0.0
  Requires-Dist: filelock==3.17.0
@@ -70,11 +72,14 @@ Requires-Dist: google-auth==2.38.0
  Requires-Dist: google-cloud-bigquery-storage==2.24.0
  Requires-Dist: google-cloud-bigquery==3.30.0
  Requires-Dist: google-cloud-core==2.4.2
+ Requires-Dist: google-cloud-spanner==3.54.0
  Requires-Dist: google-cloud-storage==3.1.0
  Requires-Dist: google-crc32c==1.6.0
  Requires-Dist: google-resumable-media==2.7.2
  Requires-Dist: googleapis-common-protos==1.69.0
  Requires-Dist: greenlet==3.2.2
+ Requires-Dist: grpc-google-iam-v1==0.14.2
+ Requires-Dist: grpc-interceptor==0.15.4
  Requires-Dist: grpcio-status==1.62.3
  Requires-Dist: grpcio==1.70.0
  Requires-Dist: hdbcli==2.23.27
@@ -166,9 +171,11 @@ Requires-Dist: soupsieve==2.6
  Requires-Dist: sqlalchemy-bigquery==1.12.1
  Requires-Dist: sqlalchemy-hana==2.0.0
  Requires-Dist: sqlalchemy-redshift==0.8.14
+ Requires-Dist: sqlalchemy-spanner==1.11.0
  Requires-Dist: sqlalchemy2-stubs==0.0.2a38
  Requires-Dist: sqlalchemy==1.4.52
  Requires-Dist: sqlglot==26.12.1
+ Requires-Dist: sqlparse==0.5.3
  Requires-Dist: stripe==10.7.0
  Requires-Dist: tenacity==9.0.0
  Requires-Dist: thrift==0.16.0
ingestr-0.13.39.dist-info/RECORD → ingestr-0.13.41.dist-info/RECORD RENAMED
@@ -2,15 +2,15 @@ ingestr/conftest.py,sha256=Q03FIJIZpLBbpj55cfCHIKEjc1FCvWJhMF2cidUJKQU,1748
  ingestr/main.py,sha256=Pe_rzwcDRKIYa7baEVUAAPOHyqQbX29RUexMl0F_S1k,25273
  ingestr/src/.gitignore,sha256=8cX1AZTSI0TcdZFGTmS_oyBjpfCzhOEt0DdAo2dFIY8,203
  ingestr/src/blob.py,sha256=onMe5ZHxPXTdcB_s2oGNdMo-XQJ3ajwOsWE9eSTGFmc,1495
- ingestr/src/buildinfo.py,sha256=edyodue-Rkn4zTwWVR9OU0dSsDXVokKw_KKllrI1amM,21
+ ingestr/src/buildinfo.py,sha256=AK7sGUNx6CPDKJOXeMexFRun9bjpyv2c9t4DII73Pes,21
  ingestr/src/destinations.py,sha256=MctbeJUyNr0DRB0XYt2xAbEKkHZ40-nXXEOYCs4KuoE,15420
  ingestr/src/errors.py,sha256=Ufs4_DfE77_E3vnA1fOQdi6cmuLVNm7_SbFLkL1XPGk,686
- ingestr/src/factory.py,sha256=j-FKRBEBZVLT_DEn-SCu9KEvaab3BchEV5hzTjpree8,5511
+ ingestr/src/factory.py,sha256=KJKIL9q7kU4oAVXy5o0wDwLAU0nG9y0xC8D7HzksYak,5597
  ingestr/src/filters.py,sha256=C-_TIVkF_cxZBgG-Run2Oyn0TAhJgA8IWXZ-OPY3uek,1136
  ingestr/src/loader.py,sha256=9NaWAyfkXdqAZSS-N72Iwo36Lbx4PyqIfaaH1dNdkFs,1712
  ingestr/src/partition.py,sha256=BrIP6wFJvyR7Nus_3ElnfxknUXeCipK_E_bB8kZowfc,969
  ingestr/src/resource.py,sha256=XG-sbBapFVEM7OhHQFQRTdTLlh-mHB-N4V1t8F8Tsww,543
- ingestr/src/sources.py,sha256=vppNI75ucM0EtW2kP5ldKyhc4Pij_hGVmKlZ9DNL4g0,79181
+ ingestr/src/sources.py,sha256=SWZAa6bokLurQRPtH7rxi8K-GSVLp_p9Ig1ArGRsxCo,82635
  ingestr/src/table_definition.py,sha256=REbAbqdlmUMUuRh8nEQRreWjPVOQ5ZcfqGkScKdCrmk,390
  ingestr/src/time.py,sha256=H_Fk2J4ShXyUM-EMY7MqCLZQhlnZMZvO952bmZPc4yE,254
  ingestr/src/version.py,sha256=J_2xgZ0mKlvuHcjdKCx2nlioneLH0I47JiU_Slr_Nwc,189
@@ -35,6 +35,7 @@ ingestr/src/chess/helpers.py,sha256=v1HTImOMjAF7AzZUPDIuHu00e7ut0o5y1kWcVYo4QZw,
  ingestr/src/chess/settings.py,sha256=p0RlCGgtXUacPDEvZmwzSWmzX0Apj1riwfz-nrMK89k,158
  ingestr/src/collector/spinner.py,sha256=_ZUqF5MI43hVIULdjF5s5mrAZbhEFXaiWirQmrv3Yk4,1201
  ingestr/src/dynamodb/__init__.py,sha256=swhxkeYBbJ35jn1IghCtvYWT2BM33KynVCh_oR4z28A,2264
+ ingestr/src/elasticsearch/__init__.py,sha256=m-q93HgUmTwGDUwHOjHawstWL06TC3WIX3H05szybrY,2556
  ingestr/src/facebook_ads/__init__.py,sha256=reEpSr4BaKA1wO3qVgCH51gW-TgWkbJ_g24UIhJWbac,9286
  ingestr/src/facebook_ads/exceptions.py,sha256=4Nlbc0Mv3i5g-9AoyT-n1PIa8IDi3VCTfEAzholx4Wc,115
  ingestr/src/facebook_ads/helpers.py,sha256=ZLbNHiKer5lPb4g3_435XeBJr57Wv0o1KTyBA1mQ100,9068
@@ -127,8 +128,8 @@ ingestr/testdata/delete_insert_part2.csv,sha256=B_KUzpzbNdDY_n7wWop1mT2cz36TmayS
  ingestr/testdata/merge_expected.csv,sha256=DReHqWGnQMsf2PBv_Q2pfjsgvikYFnf1zYcQZ7ZqYN0,276
  ingestr/testdata/merge_part1.csv,sha256=Pw8Z9IDKcNU0qQHx1z6BUf4rF_-SxKGFOvymCt4OY9I,185
  ingestr/testdata/merge_part2.csv,sha256=T_GiWxA81SN63_tMOIuemcvboEFeAmbKc7xRXvL9esw,287
- ingestr-0.13.39.dist-info/METADATA,sha256=goY5MW5AzJwYQ0cbwTmlNxZgCP1QRSt6ROmBOImESIM,13575
- ingestr-0.13.39.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
- ingestr-0.13.39.dist-info/entry_points.txt,sha256=oPJy0KBnPWYjDtP1k8qwAihcTLHSZokSQvRAw_wtfJM,46
- ingestr-0.13.39.dist-info/licenses/LICENSE.md,sha256=cW8wIhn8HFE-KLStDF9jHQ1O_ARWP3kTpk_-eOccL24,1075
- ingestr-0.13.39.dist-info/RECORD,,
+ ingestr-0.13.41.dist-info/METADATA,sha256=UzGBs9s0Kr6R1xji_ULG5Tuc383Klx2AIzfyZdXLBp4,13852
+ ingestr-0.13.41.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+ ingestr-0.13.41.dist-info/entry_points.txt,sha256=oPJy0KBnPWYjDtP1k8qwAihcTLHSZokSQvRAw_wtfJM,46
+ ingestr-0.13.41.dist-info/licenses/LICENSE.md,sha256=cW8wIhn8HFE-KLStDF9jHQ1O_ARWP3kTpk_-eOccL24,1075
+ ingestr-0.13.41.dist-info/RECORD,,