ingestr 0.13.39__py3-none-any.whl → 0.13.40__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of ingestr has been flagged as potentially problematic.

ingestr/src/buildinfo.py CHANGED
@@ -1 +1 @@
-version = "v0.13.39"
+version = "v0.13.40"
ingestr/src/elasticsearch/__init__.py ADDED
@@ -0,0 +1,80 @@
+from datetime import date, datetime
+from typing import Any, Optional
+
+import dlt
+import pendulum
+from dlt.common.time import ensure_pendulum_datetime
+from pendulum import parse
+
+from elasticsearch import Elasticsearch
+
+
+@dlt.source
+def elasticsearch_source(
+    connection_url: str,
+    index: str,
+    verify_certs: bool,
+    incremental: Optional[dlt.sources.incremental] = None,
+):
+    client = Elasticsearch(connection_url, verify_certs=verify_certs)
+
+    @dlt.resource(
+        name=index, primary_key="id", write_disposition="merge", incremental=incremental
+    )
+    def get_documents(incremental=incremental):
+        body = {"query": {"match_all": {}}}
+
+        if incremental:
+            start_value = incremental.last_value
+            range_filter = {"gte": start_value}
+            if incremental.end_value is not None:
+                range_filter["lt"] = incremental.end_value
+            body = {"query": {"range": {incremental.cursor_path: range_filter}}}
+
+        page = client.search(index=index, scroll="5m", size=5, body=body)
+
+        sid = page["_scroll_id"]
+        hits = page["hits"]["hits"]
+
+        if not hits:
+            return
+
+        # fetching first page (via .search)
+        for doc in hits:
+            doc_data = {"id": doc["_id"], **doc["_source"]}
+            if incremental:
+                doc_data[incremental.cursor_path] = convert_elasticsearch_objs(
+                    doc_data[incremental.cursor_path]
+                )
+            yield doc_data
+
+        while True:
+            # fetching page 2 and other pages (via .scroll)
+            page = client.scroll(scroll_id=sid, scroll="5m")
+            sid = page["_scroll_id"]
+            hits = page["hits"]["hits"]
+            if not hits:
+                break
+            for doc in hits:
+                doc_data = {"id": doc["_id"], **doc["_source"]}
+                if incremental:
+                    doc_data[incremental.cursor_path] = convert_elasticsearch_objs(
+                        doc_data[incremental.cursor_path]
+                    )
+                yield doc_data
+
+        client.clear_scroll(scroll_id=sid)
+
+    return get_documents
+
+
+def convert_elasticsearch_objs(value: Any) -> Any:
+    if isinstance(value, str):
+        parsed_date = parse(value, strict=False)
+        if parsed_date is not None:
+            if isinstance(
+                parsed_date,
+                (pendulum.DateTime, pendulum.Date, datetime, date, str, float, int),
+            ):
+                return ensure_pendulum_datetime(parsed_date)
+    return value
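The new source above walks an index with Elasticsearch's scroll API, yields each hit as a row keyed by the document _id, and applies a range query when an incremental cursor is supplied. A minimal usage sketch, assuming a local cluster at localhost:9200 and a hypothetical index named "events" with an "updated_at" field (the index and field names are illustrative, not part of the release):

import dlt

from ingestr.src.elasticsearch import elasticsearch_source

# Incremental cursor over the hypothetical "updated_at" field; only documents at or
# after the initial value are fetched, mirroring the range filter built in the source.
cursor = dlt.sources.incremental("updated_at", initial_value="2024-01-01T00:00:00Z")

pipeline = dlt.pipeline(
    pipeline_name="es_to_duckdb",
    destination="duckdb",
    dataset_name="elasticsearch_data",
)

load_info = pipeline.run(
    elasticsearch_source(
        connection_url="http://localhost:9200",
        index="events",
        verify_certs=False,
        incremental=cursor,
    )
)
print(load_info)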
ingestr/src/factory.py CHANGED
@@ -28,6 +28,7 @@ from ingestr.src.sources import (
     AsanaSource,
     ChessSource,
     DynamoDBSource,
+    ElasticsearchSource,
     FacebookAdsSource,
     FrankfurterSource,
     FreshdeskSource,
@@ -153,6 +154,7 @@ class SourceDestinationFactory:
         "frankfurter": FrankfurterSource,
         "freshdesk": FreshdeskSource,
         "phantombuster": PhantombusterSource,
+        "elasticsearch": ElasticsearchSource,
     }
     destinations: Dict[str, Type[DestinationProtocol]] = {
         "bigquery": BigQueryDestination,
ingestr/src/sources.py CHANGED
@@ -2298,3 +2298,53 @@ class PhantombusterSource:
             start_date=start_date,
             end_date=end_date,
         ).with_resources(table_name)
+
+
+class ElasticsearchSource:
+    def handles_incrementality(self) -> bool:
+        return False
+
+    def dlt_source(self, uri: str, table: str, **kwargs):
+        from ingestr.src.elasticsearch import elasticsearch_source
+
+        incremental = None
+        if kwargs.get("incremental_key"):
+            start_value = kwargs.get("interval_start")
+            end_value = kwargs.get("interval_end")
+
+            incremental = dlt_incremental(
+                kwargs.get("incremental_key", ""),
+                initial_value=start_value,
+                end_value=end_value,
+                range_end="closed",
+                range_start="closed",
+            )
+
+        # elasticsearch://localhost:9200?secure=true&verify_certs=false
+        parsed = urlparse(uri)
+
+        index = table
+        if not index:
+            raise ValueError("Table name must be provided which is the index name in elasticsearch")
+
+        query_params = parsed.query
+        params = parse_qs(query_params)
+
+        secure = True
+        if "secure" in params:
+            secure = params["secure"][0].capitalize() == "True"
+
+        verify_certs = True
+        if "verify_certs" in params:
+            verify_certs = params["verify_certs"][0].capitalize() == "True"
+
+        scheme = "https" if secure else "http"
+        netloc = parsed.netloc
+        connection_url = f"{scheme}://{netloc}"
+
+        return elasticsearch_source(
+            connection_url=connection_url,
+            index=index,
+            verify_certs=verify_certs,
+            incremental=incremental,
+        ).with_resources(table)
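ElasticsearchSource.dlt_source derives the HTTP(S) connection URL from the source URI: the secure flag picks the scheme and verify_certs controls certificate checking, both defaulting to true. A standalone sketch of that parsing using only the standard library (it matches the behaviour of the code above for lowercase true/false values; the function name is illustrative):

from urllib.parse import parse_qs, urlparse

def parse_es_uri(uri: str) -> tuple[str, bool]:
    # e.g. elasticsearch://localhost:9200?secure=true&verify_certs=false
    parsed = urlparse(uri)
    params = parse_qs(parsed.query)

    # Both flags default to True, as in ElasticsearchSource.dlt_source.
    secure = params.get("secure", ["true"])[0].lower() == "true"
    verify_certs = params.get("verify_certs", ["true"])[0].lower() == "true"

    scheme = "https" if secure else "http"
    return f"{scheme}://{parsed.netloc}", verify_certs

print(parse_es_uri("elasticsearch://localhost:9200?secure=true&verify_certs=false"))
# -> ('https://localhost:9200', False)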
ingestr-0.13.39.dist-info/METADATA → ingestr-0.13.40.dist-info/METADATA RENAMED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: ingestr
-Version: 0.13.39
+Version: 0.13.40
 Summary: ingestr is a command-line application that ingests data from various sources and stores them in any database.
 Project-URL: Homepage, https://github.com/bruin-data/ingestr
 Project-URL: Issues, https://github.com/bruin-data/ingestr/issues
@@ -50,6 +50,8 @@ Requires-Dist: dlt==1.10.0
 Requires-Dist: dnspython==2.7.0
 Requires-Dist: duckdb-engine==0.17.0
 Requires-Dist: duckdb==1.2.1
+Requires-Dist: elastic-transport==8.17.1
+Requires-Dist: elasticsearch==8.10.1
 Requires-Dist: et-xmlfile==2.0.0
 Requires-Dist: facebook-business==20.0.0
 Requires-Dist: filelock==3.17.0
ingestr-0.13.39.dist-info/RECORD → ingestr-0.13.40.dist-info/RECORD RENAMED
@@ -2,15 +2,15 @@ ingestr/conftest.py,sha256=Q03FIJIZpLBbpj55cfCHIKEjc1FCvWJhMF2cidUJKQU,1748
 ingestr/main.py,sha256=Pe_rzwcDRKIYa7baEVUAAPOHyqQbX29RUexMl0F_S1k,25273
 ingestr/src/.gitignore,sha256=8cX1AZTSI0TcdZFGTmS_oyBjpfCzhOEt0DdAo2dFIY8,203
 ingestr/src/blob.py,sha256=onMe5ZHxPXTdcB_s2oGNdMo-XQJ3ajwOsWE9eSTGFmc,1495
-ingestr/src/buildinfo.py,sha256=edyodue-Rkn4zTwWVR9OU0dSsDXVokKw_KKllrI1amM,21
+ingestr/src/buildinfo.py,sha256=kx4THAPAkQ2P32re7w8VrOWSESl3Fz3mnN83MSygHeE,21
 ingestr/src/destinations.py,sha256=MctbeJUyNr0DRB0XYt2xAbEKkHZ40-nXXEOYCs4KuoE,15420
 ingestr/src/errors.py,sha256=Ufs4_DfE77_E3vnA1fOQdi6cmuLVNm7_SbFLkL1XPGk,686
-ingestr/src/factory.py,sha256=j-FKRBEBZVLT_DEn-SCu9KEvaab3BchEV5hzTjpree8,5511
+ingestr/src/factory.py,sha256=x-Ym3uHMgzj_svUk7Lopn3Jj-IhcQLCuDqA_eUPFLAI,5582
 ingestr/src/filters.py,sha256=C-_TIVkF_cxZBgG-Run2Oyn0TAhJgA8IWXZ-OPY3uek,1136
 ingestr/src/loader.py,sha256=9NaWAyfkXdqAZSS-N72Iwo36Lbx4PyqIfaaH1dNdkFs,1712
 ingestr/src/partition.py,sha256=BrIP6wFJvyR7Nus_3ElnfxknUXeCipK_E_bB8kZowfc,969
 ingestr/src/resource.py,sha256=XG-sbBapFVEM7OhHQFQRTdTLlh-mHB-N4V1t8F8Tsww,543
-ingestr/src/sources.py,sha256=vppNI75ucM0EtW2kP5ldKyhc4Pij_hGVmKlZ9DNL4g0,79181
+ingestr/src/sources.py,sha256=RitbAjFVnq1I7MsjbD7hrn6Akd_92P6OCEg--YHivDw,80770
 ingestr/src/table_definition.py,sha256=REbAbqdlmUMUuRh8nEQRreWjPVOQ5ZcfqGkScKdCrmk,390
 ingestr/src/time.py,sha256=H_Fk2J4ShXyUM-EMY7MqCLZQhlnZMZvO952bmZPc4yE,254
 ingestr/src/version.py,sha256=J_2xgZ0mKlvuHcjdKCx2nlioneLH0I47JiU_Slr_Nwc,189
@@ -35,6 +35,7 @@ ingestr/src/chess/helpers.py,sha256=v1HTImOMjAF7AzZUPDIuHu00e7ut0o5y1kWcVYo4QZw,
 ingestr/src/chess/settings.py,sha256=p0RlCGgtXUacPDEvZmwzSWmzX0Apj1riwfz-nrMK89k,158
 ingestr/src/collector/spinner.py,sha256=_ZUqF5MI43hVIULdjF5s5mrAZbhEFXaiWirQmrv3Yk4,1201
 ingestr/src/dynamodb/__init__.py,sha256=swhxkeYBbJ35jn1IghCtvYWT2BM33KynVCh_oR4z28A,2264
+ingestr/src/elasticsearch/__init__.py,sha256=m-q93HgUmTwGDUwHOjHawstWL06TC3WIX3H05szybrY,2556
 ingestr/src/facebook_ads/__init__.py,sha256=reEpSr4BaKA1wO3qVgCH51gW-TgWkbJ_g24UIhJWbac,9286
 ingestr/src/facebook_ads/exceptions.py,sha256=4Nlbc0Mv3i5g-9AoyT-n1PIa8IDi3VCTfEAzholx4Wc,115
 ingestr/src/facebook_ads/helpers.py,sha256=ZLbNHiKer5lPb4g3_435XeBJr57Wv0o1KTyBA1mQ100,9068
@@ -127,8 +128,8 @@ ingestr/testdata/delete_insert_part2.csv,sha256=B_KUzpzbNdDY_n7wWop1mT2cz36TmayS
 ingestr/testdata/merge_expected.csv,sha256=DReHqWGnQMsf2PBv_Q2pfjsgvikYFnf1zYcQZ7ZqYN0,276
 ingestr/testdata/merge_part1.csv,sha256=Pw8Z9IDKcNU0qQHx1z6BUf4rF_-SxKGFOvymCt4OY9I,185
 ingestr/testdata/merge_part2.csv,sha256=T_GiWxA81SN63_tMOIuemcvboEFeAmbKc7xRXvL9esw,287
-ingestr-0.13.39.dist-info/METADATA,sha256=goY5MW5AzJwYQ0cbwTmlNxZgCP1QRSt6ROmBOImESIM,13575
-ingestr-0.13.39.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
-ingestr-0.13.39.dist-info/entry_points.txt,sha256=oPJy0KBnPWYjDtP1k8qwAihcTLHSZokSQvRAw_wtfJM,46
-ingestr-0.13.39.dist-info/licenses/LICENSE.md,sha256=cW8wIhn8HFE-KLStDF9jHQ1O_ARWP3kTpk_-eOccL24,1075
-ingestr-0.13.39.dist-info/RECORD,,
+ingestr-0.13.40.dist-info/METADATA,sha256=DV_PkyMFlK4isa37puXrTKAfFPb4oQ4_cKv-b1lojI4,13653
+ingestr-0.13.40.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+ingestr-0.13.40.dist-info/entry_points.txt,sha256=oPJy0KBnPWYjDtP1k8qwAihcTLHSZokSQvRAw_wtfJM,46
+ingestr-0.13.40.dist-info/licenses/LICENSE.md,sha256=cW8wIhn8HFE-KLStDF9jHQ1O_ARWP3kTpk_-eOccL24,1075
+ingestr-0.13.40.dist-info/RECORD,,