ingestr 0.13.39__py3-none-any.whl → 0.13.40__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of ingestr might be problematic. Click here for more details.
- ingestr/src/buildinfo.py +1 -1
- ingestr/src/elasticsearch/__init__.py +80 -0
- ingestr/src/factory.py +2 -0
- ingestr/src/sources.py +50 -0
- {ingestr-0.13.39.dist-info → ingestr-0.13.40.dist-info}/METADATA +3 -1
- {ingestr-0.13.39.dist-info → ingestr-0.13.40.dist-info}/RECORD +9 -8
- {ingestr-0.13.39.dist-info → ingestr-0.13.40.dist-info}/WHEEL +0 -0
- {ingestr-0.13.39.dist-info → ingestr-0.13.40.dist-info}/entry_points.txt +0 -0
- {ingestr-0.13.39.dist-info → ingestr-0.13.40.dist-info}/licenses/LICENSE.md +0 -0
ingestr/src/buildinfo.py
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
version = "v0.13.39"
|
|
1
|
+
version = "v0.13.40"
|
|
ingestr/src/elasticsearch/__init__.py
ADDED
|
@@ -0,0 +1,80 @@
|
|
|
1
|
+
from datetime import date, datetime
|
|
2
|
+
from typing import Any, Optional
|
|
3
|
+
|
|
4
|
+
import dlt
|
|
5
|
+
import pendulum
|
|
6
|
+
from dlt.common.time import ensure_pendulum_datetime
|
|
7
|
+
from pendulum import parse
|
|
8
|
+
|
|
9
|
+
from elasticsearch import Elasticsearch
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
@dlt.source
def elasticsearch_source(
    connection_url: str,
    index: str,
    verify_certs: bool,
    incremental: Optional[dlt.sources.incremental] = None,
):
    """Build a dlt source that streams the documents of one Elasticsearch index.

    Args:
        connection_url: URL handed to the ``Elasticsearch`` client.
        index: Index to search; also used as the name of the dlt resource.
        verify_certs: Forwarded to the client as ``verify_certs``.
        incremental: Optional dlt incremental. When set, the search becomes a
            ``range`` query on ``incremental.cursor_path`` with
            ``gte = last_value`` and, if ``end_value`` is set,
            ``lt = end_value``; the cursor field of every yielded document is
            passed through ``convert_elasticsearch_objs``.

    Returns:
        The ``get_documents`` resource. It yields one dict per hit: the hit's
        ``_id`` stored under ``"id"``, merged with the hit's ``_source``.
    """
    client = Elasticsearch(connection_url, verify_certs=verify_certs)
    scroll_keep_alive = "5m"
    # NOTE(review): five hits per round trip looks like a leftover debugging
    # value; it is kept unchanged so request behaviour stays the same.
    page_size = 5

    @dlt.resource(
        name=index, primary_key="id", write_disposition="merge", incremental=incremental
    )
    def get_documents(incremental=incremental):
        if incremental:
            range_filter = {"gte": incremental.last_value}
            if incremental.end_value is not None:
                range_filter["lt"] = incremental.end_value
            body: Any = {"query": {"range": {incremental.cursor_path: range_filter}}}
        else:
            body = {"query": {"match_all": {}}}

        def to_record(hit: Any) -> Any:
            # `_source` is unpacked after "id", so a source field that is
            # itself called "id" takes precedence over the hit's `_id`.
            record = {"id": hit["_id"], **hit["_source"]}
            if incremental:
                cursor = incremental.cursor_path
                record[cursor] = convert_elasticsearch_objs(record[cursor])
            return record

        scroll_id = None
        try:
            # The first page comes from `search`; every later one from `scroll`.
            page = client.search(
                index=index, scroll=scroll_keep_alive, size=page_size, body=body
            )
            while True:
                scroll_id = page["_scroll_id"]
                hits = page["hits"]["hits"]
                if not hits:
                    break
                for hit in hits:
                    yield to_record(hit)
                page = client.scroll(scroll_id=scroll_id, scroll=scroll_keep_alive)
        finally:
            # Release the server-side scroll context on every exit path: an
            # empty first page, an error part-way through, or the consumer
            # closing the generator early.
            if scroll_id is not None:
                client.clear_scroll(scroll_id=scroll_id)

    return get_documents
|
|
69
|
+
|
|
70
|
+
|
|
71
|
+
def convert_elasticsearch_objs(value: Any) -> Any:
    """Turn a date-like string into a pendulum datetime; return anything else as-is.

    Args:
        value: A field value taken from an Elasticsearch document.

    Returns:
        The result of ``ensure_pendulum_datetime`` when ``value`` is a string
        that pendulum parses to a date or datetime; otherwise ``value``
        unchanged. This includes non-strings, strings that cannot be parsed,
        and strings that parse to something other than a date or datetime.
    """
    if not isinstance(value, str):
        return value

    try:
        # NOTE(review): non-strict parsing is lenient, so short numeric-looking
        # strings may be read as dates. Confirm that is acceptable for cursors.
        parsed = parse(value, strict=False)
    except (ValueError, OverflowError):
        # The parser signals an unreadable string with a ValueError subclass;
        # a non-date string must fall through to the pass-through result
        # rather than abort the whole extraction.
        return value

    # pendulum.DateTime and pendulum.Date subclass the stdlib types checked here.
    if isinstance(parsed, (datetime, date)):
        return ensure_pendulum_datetime(parsed)
    return value
|
ingestr/src/factory.py
CHANGED
|
@@ -28,6 +28,7 @@ from ingestr.src.sources import (
|
|
|
28
28
|
AsanaSource,
|
|
29
29
|
ChessSource,
|
|
30
30
|
DynamoDBSource,
|
|
31
|
+
ElasticsearchSource,
|
|
31
32
|
FacebookAdsSource,
|
|
32
33
|
FrankfurterSource,
|
|
33
34
|
FreshdeskSource,
|
|
@@ -153,6 +154,7 @@ class SourceDestinationFactory:
|
|
|
153
154
|
"frankfurter": FrankfurterSource,
|
|
154
155
|
"freshdesk": FreshdeskSource,
|
|
155
156
|
"phantombuster": PhantombusterSource,
|
|
157
|
+
"elasticsearch": ElasticsearchSource,
|
|
156
158
|
}
|
|
157
159
|
destinations: Dict[str, Type[DestinationProtocol]] = {
|
|
158
160
|
"bigquery": BigQueryDestination,
|
ingestr/src/sources.py
CHANGED
|
@@ -2298,3 +2298,53 @@ class PhantombusterSource:
|
|
|
2298
2298
|
start_date=start_date,
|
|
2299
2299
|
end_date=end_date,
|
|
2300
2300
|
).with_resources(table_name)
|
|
2301
|
+
|
|
2302
|
+
|
|
2303
|
+
class ElasticsearchSource:
    """Source adapter that maps an ``elasticsearch://`` URI onto the dlt source.

    Example URI: ``elasticsearch://localhost:9200?secure=true&verify_certs=false``
    """

    def handles_incrementality(self) -> bool:
        """Return ``False``; the caller decides what that implies."""
        return False

    @staticmethod
    def _query_flag(params, name: str, default: bool = True) -> bool:
        """Read a boolean query parameter; only "true" (any casing) is truthy."""
        if name not in params:
            return default
        return params[name][0].lower() == "true"

    def dlt_source(self, uri: str, table: str, **kwargs):
        """Create the Elasticsearch dlt source restricted to one index.

        Args:
            uri: Connection URI. The host part is reused verbatim; the query
                parameters ``secure`` (https vs. http) and ``verify_certs``
                both default to true.
            table: Name of the Elasticsearch index to read.
            **kwargs: ``incremental_key`` enables incremental loading on that
                field; ``interval_start`` / ``interval_end`` become the
                incremental's initial and end values.

        Returns:
            The dlt source with only the ``table`` resource selected.

        Raises:
            ValueError: If ``table`` is empty or ``uri`` contains no host.
        """
        # Validate the inputs first so bad calls fail before any other work.
        if not table:
            raise ValueError("Table name must be provided which is the index name in elasticsearch")

        parsed = urlparse(uri)
        if not parsed.netloc:
            raise ValueError(
                "Elasticsearch URI must include a host, e.g. elasticsearch://localhost:9200"
            )

        params = parse_qs(parsed.query)
        secure = self._query_flag(params, "secure")
        verify_certs = self._query_flag(params, "verify_certs")
        scheme = "https" if secure else "http"
        connection_url = f"{scheme}://{parsed.netloc}"

        incremental = None
        incremental_key = kwargs.get("incremental_key")
        if incremental_key:
            # NOTE(review): range_end is "closed" here, while the search built
            # in ingestr/src/elasticsearch uses "lt" for end_value; confirm
            # which upper bound is intended.
            incremental = dlt_incremental(
                incremental_key,
                initial_value=kwargs.get("interval_start"),
                end_value=kwargs.get("interval_end"),
                range_end="closed",
                range_start="closed",
            )

        # Imported lazily, at the point of use.
        from ingestr.src.elasticsearch import elasticsearch_source

        return elasticsearch_source(
            connection_url=connection_url,
            index=table,
            verify_certs=verify_certs,
            incremental=incremental,
        ).with_resources(table)
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: ingestr
|
|
3
|
-
Version: 0.13.39
|
|
3
|
+
Version: 0.13.40
|
|
4
4
|
Summary: ingestr is a command-line application that ingests data from various sources and stores them in any database.
|
|
5
5
|
Project-URL: Homepage, https://github.com/bruin-data/ingestr
|
|
6
6
|
Project-URL: Issues, https://github.com/bruin-data/ingestr/issues
|
|
@@ -50,6 +50,8 @@ Requires-Dist: dlt==1.10.0
|
|
|
50
50
|
Requires-Dist: dnspython==2.7.0
|
|
51
51
|
Requires-Dist: duckdb-engine==0.17.0
|
|
52
52
|
Requires-Dist: duckdb==1.2.1
|
|
53
|
+
Requires-Dist: elastic-transport==8.17.1
|
|
54
|
+
Requires-Dist: elasticsearch==8.10.1
|
|
53
55
|
Requires-Dist: et-xmlfile==2.0.0
|
|
54
56
|
Requires-Dist: facebook-business==20.0.0
|
|
55
57
|
Requires-Dist: filelock==3.17.0
|
|
@@ -2,15 +2,15 @@ ingestr/conftest.py,sha256=Q03FIJIZpLBbpj55cfCHIKEjc1FCvWJhMF2cidUJKQU,1748
|
|
|
2
2
|
ingestr/main.py,sha256=Pe_rzwcDRKIYa7baEVUAAPOHyqQbX29RUexMl0F_S1k,25273
|
|
3
3
|
ingestr/src/.gitignore,sha256=8cX1AZTSI0TcdZFGTmS_oyBjpfCzhOEt0DdAo2dFIY8,203
|
|
4
4
|
ingestr/src/blob.py,sha256=onMe5ZHxPXTdcB_s2oGNdMo-XQJ3ajwOsWE9eSTGFmc,1495
|
|
5
|
-
ingestr/src/buildinfo.py,sha256=
|
|
5
|
+
ingestr/src/buildinfo.py,sha256=kx4THAPAkQ2P32re7w8VrOWSESl3Fz3mnN83MSygHeE,21
|
|
6
6
|
ingestr/src/destinations.py,sha256=MctbeJUyNr0DRB0XYt2xAbEKkHZ40-nXXEOYCs4KuoE,15420
|
|
7
7
|
ingestr/src/errors.py,sha256=Ufs4_DfE77_E3vnA1fOQdi6cmuLVNm7_SbFLkL1XPGk,686
|
|
8
|
-
ingestr/src/factory.py,sha256=
|
|
8
|
+
ingestr/src/factory.py,sha256=x-Ym3uHMgzj_svUk7Lopn3Jj-IhcQLCuDqA_eUPFLAI,5582
|
|
9
9
|
ingestr/src/filters.py,sha256=C-_TIVkF_cxZBgG-Run2Oyn0TAhJgA8IWXZ-OPY3uek,1136
|
|
10
10
|
ingestr/src/loader.py,sha256=9NaWAyfkXdqAZSS-N72Iwo36Lbx4PyqIfaaH1dNdkFs,1712
|
|
11
11
|
ingestr/src/partition.py,sha256=BrIP6wFJvyR7Nus_3ElnfxknUXeCipK_E_bB8kZowfc,969
|
|
12
12
|
ingestr/src/resource.py,sha256=XG-sbBapFVEM7OhHQFQRTdTLlh-mHB-N4V1t8F8Tsww,543
|
|
13
|
-
ingestr/src/sources.py,sha256=
|
|
13
|
+
ingestr/src/sources.py,sha256=RitbAjFVnq1I7MsjbD7hrn6Akd_92P6OCEg--YHivDw,80770
|
|
14
14
|
ingestr/src/table_definition.py,sha256=REbAbqdlmUMUuRh8nEQRreWjPVOQ5ZcfqGkScKdCrmk,390
|
|
15
15
|
ingestr/src/time.py,sha256=H_Fk2J4ShXyUM-EMY7MqCLZQhlnZMZvO952bmZPc4yE,254
|
|
16
16
|
ingestr/src/version.py,sha256=J_2xgZ0mKlvuHcjdKCx2nlioneLH0I47JiU_Slr_Nwc,189
|
|
@@ -35,6 +35,7 @@ ingestr/src/chess/helpers.py,sha256=v1HTImOMjAF7AzZUPDIuHu00e7ut0o5y1kWcVYo4QZw,
|
|
|
35
35
|
ingestr/src/chess/settings.py,sha256=p0RlCGgtXUacPDEvZmwzSWmzX0Apj1riwfz-nrMK89k,158
|
|
36
36
|
ingestr/src/collector/spinner.py,sha256=_ZUqF5MI43hVIULdjF5s5mrAZbhEFXaiWirQmrv3Yk4,1201
|
|
37
37
|
ingestr/src/dynamodb/__init__.py,sha256=swhxkeYBbJ35jn1IghCtvYWT2BM33KynVCh_oR4z28A,2264
|
|
38
|
+
ingestr/src/elasticsearch/__init__.py,sha256=m-q93HgUmTwGDUwHOjHawstWL06TC3WIX3H05szybrY,2556
|
|
38
39
|
ingestr/src/facebook_ads/__init__.py,sha256=reEpSr4BaKA1wO3qVgCH51gW-TgWkbJ_g24UIhJWbac,9286
|
|
39
40
|
ingestr/src/facebook_ads/exceptions.py,sha256=4Nlbc0Mv3i5g-9AoyT-n1PIa8IDi3VCTfEAzholx4Wc,115
|
|
40
41
|
ingestr/src/facebook_ads/helpers.py,sha256=ZLbNHiKer5lPb4g3_435XeBJr57Wv0o1KTyBA1mQ100,9068
|
|
@@ -127,8 +128,8 @@ ingestr/testdata/delete_insert_part2.csv,sha256=B_KUzpzbNdDY_n7wWop1mT2cz36TmayS
|
|
|
127
128
|
ingestr/testdata/merge_expected.csv,sha256=DReHqWGnQMsf2PBv_Q2pfjsgvikYFnf1zYcQZ7ZqYN0,276
|
|
128
129
|
ingestr/testdata/merge_part1.csv,sha256=Pw8Z9IDKcNU0qQHx1z6BUf4rF_-SxKGFOvymCt4OY9I,185
|
|
129
130
|
ingestr/testdata/merge_part2.csv,sha256=T_GiWxA81SN63_tMOIuemcvboEFeAmbKc7xRXvL9esw,287
|
|
130
|
-
ingestr-0.13.
|
|
131
|
-
ingestr-0.13.39.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
|
|
132
|
-
ingestr-0.13.39.dist-info/entry_points.txt,sha256=oPJy0KBnPWYjDtP1k8qwAihcTLHSZokSQvRAw_wtfJM,46
|
|
133
|
-
ingestr-0.13.39.dist-info/licenses/LICENSE.md,sha256=cW8wIhn8HFE-KLStDF9jHQ1O_ARWP3kTpk_-eOccL24,1075
|
|
134
|
-
ingestr-0.13.39.dist-info/RECORD,,
|
|
131
|
+
ingestr-0.13.40.dist-info/METADATA,sha256=DV_PkyMFlK4isa37puXrTKAfFPb4oQ4_cKv-b1lojI4,13653
|
|
132
|
+
ingestr-0.13.40.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
|
|
133
|
+
ingestr-0.13.40.dist-info/entry_points.txt,sha256=oPJy0KBnPWYjDtP1k8qwAihcTLHSZokSQvRAw_wtfJM,46
|
|
134
|
+
ingestr-0.13.40.dist-info/licenses/LICENSE.md,sha256=cW8wIhn8HFE-KLStDF9jHQ1O_ARWP3kTpk_-eOccL24,1075
|
|
135
|
+
ingestr-0.13.40.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|