omniload 0.0.0.dev0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- omniload/conftest.py +72 -0
- omniload/main.py +810 -0
- omniload/src/.gitignore +10 -0
- omniload/src/adjust/__init__.py +108 -0
- omniload/src/adjust/adjust_helpers.py +122 -0
- omniload/src/airtable/__init__.py +84 -0
- omniload/src/allium/__init__.py +128 -0
- omniload/src/anthropic/__init__.py +277 -0
- omniload/src/anthropic/helpers.py +525 -0
- omniload/src/applovin/__init__.py +316 -0
- omniload/src/applovin_max/__init__.py +117 -0
- omniload/src/appsflyer/__init__.py +325 -0
- omniload/src/appsflyer/client.py +110 -0
- omniload/src/appstore/__init__.py +142 -0
- omniload/src/appstore/client.py +126 -0
- omniload/src/appstore/errors.py +15 -0
- omniload/src/appstore/models.py +117 -0
- omniload/src/appstore/resources.py +179 -0
- omniload/src/arrow/__init__.py +81 -0
- omniload/src/asana_source/__init__.py +281 -0
- omniload/src/asana_source/helpers.py +30 -0
- omniload/src/asana_source/settings.py +158 -0
- omniload/src/attio/__init__.py +102 -0
- omniload/src/attio/helpers.py +65 -0
- omniload/src/blob.py +95 -0
- omniload/src/bruin/__init__.py +76 -0
- omniload/src/chess/__init__.py +180 -0
- omniload/src/chess/helpers.py +35 -0
- omniload/src/chess/settings.py +18 -0
- omniload/src/clickup/__init__.py +85 -0
- omniload/src/clickup/helpers.py +47 -0
- omniload/src/collector/spinner.py +43 -0
- omniload/src/couchbase_source/__init__.py +118 -0
- omniload/src/couchbase_source/helpers.py +135 -0
- omniload/src/cursor/__init__.py +83 -0
- omniload/src/cursor/helpers.py +188 -0
- omniload/src/customer_io/__init__.py +486 -0
- omniload/src/customer_io/helpers.py +530 -0
- omniload/src/destinations.py +982 -0
- omniload/src/docebo/__init__.py +589 -0
- omniload/src/docebo/client.py +435 -0
- omniload/src/docebo/helpers.py +97 -0
- omniload/src/dune/__init__.py +104 -0
- omniload/src/dune/helpers.py +108 -0
- omniload/src/dynamodb/__init__.py +86 -0
- omniload/src/elasticsearch/__init__.py +80 -0
- omniload/src/elasticsearch/helpers.py +141 -0
- omniload/src/errors.py +26 -0
- omniload/src/facebook_ads/__init__.py +403 -0
- omniload/src/facebook_ads/exceptions.py +19 -0
- omniload/src/facebook_ads/helpers.py +296 -0
- omniload/src/facebook_ads/settings.py +224 -0
- omniload/src/facebook_ads/utils.py +53 -0
- omniload/src/factory.py +305 -0
- omniload/src/filesystem/__init__.py +133 -0
- omniload/src/filesystem/helpers.py +114 -0
- omniload/src/filesystem/readers.py +187 -0
- omniload/src/filters.py +62 -0
- omniload/src/fireflies/__init__.py +151 -0
- omniload/src/fireflies/helpers.py +753 -0
- omniload/src/fluxx/__init__.py +10013 -0
- omniload/src/fluxx/helpers.py +233 -0
- omniload/src/frankfurter/__init__.py +157 -0
- omniload/src/frankfurter/helpers.py +48 -0
- omniload/src/freshdesk/__init__.py +103 -0
- omniload/src/freshdesk/freshdesk_client.py +151 -0
- omniload/src/freshdesk/settings.py +23 -0
- omniload/src/fundraiseup/__init__.py +95 -0
- omniload/src/fundraiseup/client.py +81 -0
- omniload/src/github/__init__.py +202 -0
- omniload/src/github/helpers.py +207 -0
- omniload/src/github/queries.py +129 -0
- omniload/src/github/settings.py +24 -0
- omniload/src/google_ads/__init__.py +198 -0
- omniload/src/google_ads/field.py +17 -0
- omniload/src/google_ads/metrics.py +254 -0
- omniload/src/google_ads/predicates.py +37 -0
- omniload/src/google_ads/reports.py +411 -0
- omniload/src/google_ads/test_google_ads.py +184 -0
- omniload/src/google_analytics/__init__.py +144 -0
- omniload/src/google_analytics/helpers.py +312 -0
- omniload/src/google_sheets/README.md +95 -0
- omniload/src/google_sheets/__init__.py +166 -0
- omniload/src/google_sheets/helpers/__init__.py +15 -0
- omniload/src/google_sheets/helpers/api_calls.py +160 -0
- omniload/src/google_sheets/helpers/data_processing.py +316 -0
- omniload/src/gorgias/__init__.py +595 -0
- omniload/src/gorgias/helpers.py +166 -0
- omniload/src/hostaway/__init__.py +302 -0
- omniload/src/hostaway/client.py +288 -0
- omniload/src/http/__init__.py +38 -0
- omniload/src/http/readers.py +146 -0
- omniload/src/http_client.py +24 -0
- omniload/src/hubspot/__init__.py +800 -0
- omniload/src/hubspot/helpers.py +417 -0
- omniload/src/hubspot/settings.py +329 -0
- omniload/src/indeed/__init__.py +153 -0
- omniload/src/indeed/helpers.py +228 -0
- omniload/src/influxdb/__init__.py +46 -0
- omniload/src/influxdb/client.py +34 -0
- omniload/src/intercom/__init__.py +142 -0
- omniload/src/intercom/helpers.py +674 -0
- omniload/src/intercom/settings.py +279 -0
- omniload/src/isoc_pulse/__init__.py +159 -0
- omniload/src/jira_source/__init__.py +377 -0
- omniload/src/jira_source/helpers.py +510 -0
- omniload/src/jira_source/settings.py +184 -0
- omniload/src/kafka/__init__.py +120 -0
- omniload/src/kafka/helpers.py +241 -0
- omniload/src/kinesis/__init__.py +153 -0
- omniload/src/kinesis/helpers.py +96 -0
- omniload/src/klaviyo/__init__.py +237 -0
- omniload/src/klaviyo/client.py +212 -0
- omniload/src/klaviyo/helpers.py +19 -0
- omniload/src/linear/__init__.py +634 -0
- omniload/src/linear/helpers.py +111 -0
- omniload/src/linkedin_ads/__init__.py +266 -0
- omniload/src/linkedin_ads/dimension_time_enum.py +17 -0
- omniload/src/linkedin_ads/helpers.py +246 -0
- omniload/src/loader.py +69 -0
- omniload/src/mailchimp/__init__.py +126 -0
- omniload/src/mailchimp/helpers.py +226 -0
- omniload/src/mailchimp/settings.py +164 -0
- omniload/src/masking.py +344 -0
- omniload/src/mixpanel/__init__.py +62 -0
- omniload/src/mixpanel/client.py +104 -0
- omniload/src/monday/__init__.py +246 -0
- omniload/src/monday/helpers.py +392 -0
- omniload/src/monday/settings.py +325 -0
- omniload/src/mongodb/__init__.py +281 -0
- omniload/src/mongodb/helpers.py +975 -0
- omniload/src/notion/__init__.py +69 -0
- omniload/src/notion/helpers/__init__.py +14 -0
- omniload/src/notion/helpers/client.py +178 -0
- omniload/src/notion/helpers/database.py +92 -0
- omniload/src/notion/settings.py +17 -0
- omniload/src/partition.py +32 -0
- omniload/src/personio/__init__.py +345 -0
- omniload/src/personio/helpers.py +100 -0
- omniload/src/phantombuster/__init__.py +65 -0
- omniload/src/phantombuster/client.py +87 -0
- omniload/src/pinterest/__init__.py +82 -0
- omniload/src/pipedrive/__init__.py +212 -0
- omniload/src/pipedrive/helpers/__init__.py +37 -0
- omniload/src/pipedrive/helpers/custom_fields_munger.py +116 -0
- omniload/src/pipedrive/helpers/pages.py +129 -0
- omniload/src/pipedrive/settings.py +41 -0
- omniload/src/pipedrive/typing.py +17 -0
- omniload/src/plusvibeai/__init__.py +335 -0
- omniload/src/plusvibeai/helpers.py +544 -0
- omniload/src/plusvibeai/settings.py +252 -0
- omniload/src/primer/__init__.py +45 -0
- omniload/src/primer/helpers.py +79 -0
- omniload/src/quickbooks/__init__.py +117 -0
- omniload/src/reddit_ads/__init__.py +183 -0
- omniload/src/reddit_ads/helpers.py +232 -0
- omniload/src/resource.py +40 -0
- omniload/src/revenuecat/__init__.py +83 -0
- omniload/src/revenuecat/helpers.py +237 -0
- omniload/src/salesforce/__init__.py +170 -0
- omniload/src/salesforce/helpers.py +78 -0
- omniload/src/shopify/__init__.py +1953 -0
- omniload/src/shopify/exceptions.py +17 -0
- omniload/src/shopify/helpers.py +202 -0
- omniload/src/shopify/settings.py +19 -0
- omniload/src/slack/__init__.py +290 -0
- omniload/src/slack/helpers.py +218 -0
- omniload/src/slack/settings.py +36 -0
- omniload/src/smartsheets/__init__.py +82 -0
- omniload/src/snapchat_ads/__init__.py +455 -0
- omniload/src/snapchat_ads/client.py +72 -0
- omniload/src/snapchat_ads/helpers.py +630 -0
- omniload/src/snapchat_ads/settings.py +130 -0
- omniload/src/socrata_source/__init__.py +83 -0
- omniload/src/socrata_source/helpers.py +85 -0
- omniload/src/socrata_source/settings.py +8 -0
- omniload/src/solidgate/__init__.py +219 -0
- omniload/src/solidgate/helpers.py +154 -0
- omniload/src/sources.py +5408 -0
- omniload/src/sql_database/__init__.py +0 -0
- omniload/src/sql_database/callbacks.py +66 -0
- omniload/src/stripe_analytics/__init__.py +183 -0
- omniload/src/stripe_analytics/helpers.py +386 -0
- omniload/src/stripe_analytics/settings.py +80 -0
- omniload/src/table_definition.py +15 -0
- omniload/src/testdata/fakebqcredentials.json +14 -0
- omniload/src/tiktok_ads/__init__.py +150 -0
- omniload/src/tiktok_ads/tiktok_helpers.py +130 -0
- omniload/src/time.py +11 -0
- omniload/src/trustpilot/__init__.py +48 -0
- omniload/src/trustpilot/client.py +48 -0
- omniload/src/version.py +6 -0
- omniload/src/wise/__init__.py +68 -0
- omniload/src/wise/client.py +63 -0
- omniload/src/zendesk/__init__.py +480 -0
- omniload/src/zendesk/helpers/__init__.py +39 -0
- omniload/src/zendesk/helpers/api_helpers.py +119 -0
- omniload/src/zendesk/helpers/credentials.py +68 -0
- omniload/src/zendesk/helpers/talk_api.py +132 -0
- omniload/src/zendesk/settings.py +71 -0
- omniload/src/zoom/__init__.py +99 -0
- omniload/src/zoom/helpers.py +102 -0
- omniload/testdata/.gitignore +2 -0
- omniload/testdata/create_replace.csv +21 -0
- omniload/testdata/delete_insert_expected.csv +6 -0
- omniload/testdata/delete_insert_part1.csv +5 -0
- omniload/testdata/delete_insert_part2.csv +6 -0
- omniload/testdata/merge_expected.csv +5 -0
- omniload/testdata/merge_part1.csv +4 -0
- omniload/testdata/merge_part2.csv +5 -0
- omniload/tests/unit/test_smartsheets.py +133 -0
- omniload-0.0.0.dev0.dist-info/METADATA +439 -0
- omniload-0.0.0.dev0.dist-info/RECORD +218 -0
- omniload-0.0.0.dev0.dist-info/WHEEL +4 -0
- omniload-0.0.0.dev0.dist-info/entry_points.txt +2 -0
- omniload-0.0.0.dev0.dist-info/licenses/LICENSE.Apache-2.0 +201 -0
- omniload-0.0.0.dev0.dist-info/licenses/LICENSE.md +21 -0
- omniload-0.0.0.dev0.dist-info/licenses/NOTICE +35 -0
|
@@ -0,0 +1,108 @@
|
|
|
1
|
+
import time
|
|
2
|
+
from typing import Any, Iterator
|
|
3
|
+
|
|
4
|
+
# Root URL that every Dune API request path in this module is appended to.
BASE_URL = "https://api.dune.com/api/v1"
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
def poll_execution(session, headers: dict, execution_id: str) -> None:
    """Poll an execution's status endpoint until it reaches a final state.

    Returns normally once the reported state is ``QUERY_STATE_COMPLETED``.
    While the state is pending or executing, sleeps and polls again.

    Args:
        session: Object exposing ``get(url, headers=...)`` whose response has
            ``raise_for_status()`` and ``json()`` (presumably a
            ``requests.Session`` -- confirm against the caller).
        headers: Headers sent with every status request.
        execution_id: Identifier interpolated into the status URL.

    Raises:
        ValueError: The execution failed, was cancelled, expired, or reported
            a state this function does not recognise.
        TimeoutError: The execution was still in progress after the maximum
            number of polls.
    """
    poll_interval = 5
    max_retries = 8640  # Max 12 hours with 5-second intervals
    in_progress_states = ("QUERY_STATE_PENDING", "QUERY_STATE_EXECUTING")

    for _ in range(max_retries):
        status_response = session.get(
            f"{BASE_URL}/execution/{execution_id}/status",
            headers=headers,
        )
        status_response.raise_for_status()
        payload = status_response.json()
        state = payload.get("state")

        if state == "QUERY_STATE_COMPLETED":
            return

        if state == "QUERY_STATE_FAILED":
            error = payload.get("error", {})
            # The error payload may be a dict carrying a message, or any
            # other value, which is stringified as-is.
            if isinstance(error, dict):
                error_msg = error.get("message", "Unknown error")
            else:
                error_msg = str(error)
            raise ValueError(f"Query execution failed: {error_msg}")

        if state == "QUERY_STATE_CANCELLED":
            raise ValueError("Query execution was cancelled")

        if state == "QUERY_STATE_EXPIRED":
            raise ValueError("Query execution expired")

        if state not in in_progress_states:
            raise ValueError(f"Unknown query state: {state}")

        # Still pending or executing: wait before asking again.
        time.sleep(poll_interval)

    raise TimeoutError(
        f"Query execution timed out after {max_retries * poll_interval} seconds"
    )
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
def fetch_results(
    session, headers: dict, execution_id: str
) -> Iterator[dict[str, Any]]:
    """Yield the result rows of an execution, following offset pagination.

    Requests pages of up to 1000 rows from the execution's results endpoint.
    Stops when a page has no rows or the response carries no (or a falsy)
    ``next_offset``.

    Args:
        session: Object exposing ``get(url, headers=..., params=...)`` whose
            response has ``raise_for_status()`` and ``json()``.
        headers: Headers sent with every request.
        execution_id: Identifier interpolated into the results URL.

    Yields:
        Each element of ``result.rows`` from every fetched page.
    """
    page_limit = 1000
    current_offset = 0

    while True:
        results_response = session.get(
            f"{BASE_URL}/execution/{execution_id}/results",
            headers=headers,
            params={"limit": page_limit, "offset": current_offset},
        )
        results_response.raise_for_status()
        payload = results_response.json()

        page_rows = payload.get("result", {}).get("rows", [])
        if not page_rows:
            return

        yield from page_rows

        # The response tells us where the next page starts; absence means
        # this was the last page.
        following = payload.get("next_offset")
        if not following:
            return
        current_offset = following
|
|
79
|
+
|
|
80
|
+
|
|
81
|
+
def fetch_queries(session, headers: dict) -> Iterator[dict[str, Any]]:
    """Yield every entry of the ``queries`` listing, 100 per request.

    Pagination advances by the number of entries received and stops when a
    page is empty or the running count reaches the ``total`` reported by the
    response (treated as 0 when absent).

    Args:
        session: Object exposing ``get(url, headers=..., params=...)`` whose
            response has ``raise_for_status()`` and ``json()``.
        headers: Headers sent with every request.

    Yields:
        Each element of the ``queries`` list from every fetched page.
    """
    page_limit = 100
    fetched = 0

    while True:
        response = session.get(
            f"{BASE_URL}/queries",
            headers=headers,
            params={"limit": page_limit, "offset": fetched},
        )
        response.raise_for_status()
        payload = response.json()

        batch = payload.get("queries", [])
        if not batch:
            return

        yield from batch

        fetched += len(batch)
        if fetched >= payload.get("total", 0):
            return
|
|
@@ -0,0 +1,86 @@
|
|
|
1
|
+
from dataclasses import dataclass
|
|
2
|
+
from typing import Optional
|
|
3
|
+
|
|
4
|
+
import boto3
|
|
5
|
+
import dlt
|
|
6
|
+
from boto3.dynamodb.conditions import Attr
|
|
7
|
+
from dlt.common.configuration.specs import AwsCredentials
|
|
8
|
+
|
|
9
|
+
# Scan-response key whose presence means another page exists; its value is
# passed back as ExclusiveStartKey on the next scan.
PAGINATION_KEY = "LastEvaluatedKey"
|
|
10
|
+
# Keyword-argument name under which the filter condition is handed to scan().
FILTER_KEY = "FilterExpression"
|
|
11
|
+
# Scan-response key holding the items of the current page.
DATA_KEY = "Items"
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
@dataclass
class TableSchema:
    """Names of the key attributes discovered in a table's key schema."""

    # Attribute name of the "HASH" key entry, or None if none was found.
    primary_key: Optional[str]
    # Attribute name of the "RANGE" key entry, or None if none was found.
    sort_key: Optional[str]
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
def parseSchema(table) -> TableSchema:
    """Extract the hash and range key attribute names from ``table.key_schema``.

    Args:
        table: Object exposing ``key_schema`` (an iterable of mappings with
            ``"KeyType"`` and ``"AttributeName"``) and ``name``.

    Returns:
        A ``TableSchema`` whose ``primary_key`` is the ``"HASH"`` attribute
        and whose ``sort_key`` is the ``"RANGE"`` attribute (``None`` if the
        schema has no such entry).

    Raises:
        ValueError: No ``"HASH"`` entry exists in the key schema.
    """
    hash_attribute = None
    range_attribute = None

    for entry in table.key_schema:
        key_type = entry["KeyType"]
        if key_type == "HASH":
            hash_attribute = entry["AttributeName"]
        elif key_type == "RANGE":
            range_attribute = entry["AttributeName"]

    if hash_attribute is None:
        raise ValueError(f"Table {table.name} has no primary key!")

    return TableSchema(hash_attribute, range_attribute)
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
@dlt.source
def dynamodb(
    table_name: str,
    credentials: AwsCredentials,
    incremental: Optional[dlt.sources.incremental] = None,
):
    """dlt source yielding one resource that scans a single DynamoDB table.

    Args:
        table_name: Name of the table to read.
        credentials: Supplies the access key id, secret access key, region
            name and endpoint URL used to build the boto3 session/resource.
        incremental: Optional incremental cursor forwarded to the table scan.

    Yields:
        The ``dynamodb_table`` resource bound to the table, with the table's
        hash key declared as its primary key.
    """
    aws_session = boto3.Session(
        aws_access_key_id=credentials.aws_access_key_id,
        aws_secret_access_key=credentials.aws_secret_access_key,
        region_name=credentials.region_name,
    )
    dynamo = aws_session.resource("dynamodb", endpoint_url=credentials.endpoint_url)
    table = dynamo.Table(table_name)

    # The hash key found in the table's key schema becomes the primary key.
    key_schema = parseSchema(table)
    table_resource = dlt.resource(dynamodb_table, primary_key=key_schema.primary_key)

    yield table_resource(table, incremental)
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
def dynamodb_table(
    table,
    incremental: Optional[dlt.sources.incremental] = None,
):
    """Yield every item returned by scanning ``table``, page by page.

    Args:
        table: Object exposing ``scan(**kwargs)`` returning a mapping with an
            items entry and, while more pages remain, a pagination key.
        incremental: Optional incremental cursor used to build the scan
            filter arguments.

    Yields:
        Each item of every scan page.
    """
    scan_kwargs = build_scan_args(incremental)

    page = table.scan(**scan_kwargs)
    yield from page[DATA_KEY]

    # Keep scanning from the last evaluated key for as long as the response
    # advertises one.
    while PAGINATION_KEY in page:
        page = table.scan(ExclusiveStartKey=page[PAGINATION_KEY], **scan_kwargs)
        yield from page[DATA_KEY]
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
def build_scan_args(
    incremental: Optional[dlt.sources.incremental] = None,
) -> dict:
    """Build the keyword arguments for ``table.scan`` from an incremental cursor.

    Args:
        incremental: Optional incremental cursor. When it is ``None`` or has
            no ``last_value`` yet, no filter is produced.

    Returns:
        An empty dict, or a dict with a single filter-expression entry:
        ``cursor >= last_value`` when there is no ``end_value``, otherwise
        ``cursor BETWEEN last_value AND end_value``.
    """
    scan_args: dict = {}

    # Compare against None rather than relying on truthiness so that falsy
    # but valid cursor values (e.g. 0) still produce a filter.
    if incremental is None or incremental.last_value is None:
        return scan_args

    cursor = Attr(incremental.cursor_path)
    if incremental.end_value is not None:
        criteria = cursor.between(incremental.last_value, incremental.end_value)
    else:
        criteria = cursor.gte(incremental.last_value)

    scan_args[FILTER_KEY] = criteria
    return scan_args
|
|
@@ -0,0 +1,80 @@
|
|
|
1
|
+
from datetime import date, datetime
|
|
2
|
+
from typing import Any, Optional
|
|
3
|
+
|
|
4
|
+
import dlt
|
|
5
|
+
import pendulum
|
|
6
|
+
from dlt.common.time import ensure_pendulum_datetime
|
|
7
|
+
from pendulum import parse
|
|
8
|
+
|
|
9
|
+
from elasticsearch import Elasticsearch
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
@dlt.source
def elasticsearch_source(
    connection_url: str,
    index: str,
    verify_certs: bool,
    incremental: Optional[dlt.sources.incremental] = None,
):
    """dlt source reading all documents of one Elasticsearch index via scroll.

    Args:
        connection_url: URL handed to the ``Elasticsearch`` client.
        index: Index to read; also used as the resource name.
        verify_certs: Forwarded to the ``Elasticsearch`` client.
        incremental: Optional incremental cursor. When set, only documents
            whose cursor field is ``>= last_value`` (and ``< end_value`` when
            an end value is given) are requested.

    Returns:
        The ``get_documents`` resource, which yields one dict per hit made of
        ``{"id": <_id>, **_source}``.
    """
    client = Elasticsearch(connection_url, verify_certs=verify_certs)

    @dlt.resource(
        name=index, primary_key="id", write_disposition="merge", incremental=incremental
    )
    def get_documents(incremental=incremental):
        body = {"query": {"match_all": {}}}

        if incremental:
            range_filter = {"gte": incremental.last_value}
            if incremental.end_value is not None:
                range_filter["lt"] = incremental.end_value
            body = {"query": {"range": {incremental.cursor_path: range_filter}}}

        # The first page comes from .search; later pages come from .scroll.
        page = client.search(index=index, scroll="5m", size=1000, body=body)
        sid = page["_scroll_id"]

        try:
            while True:
                hits = page["hits"]["hits"]
                if not hits:
                    break

                for doc in hits:
                    doc_data = {"id": doc["_id"], **doc["_source"]}
                    if incremental:
                        doc_data[incremental.cursor_path] = convert_elasticsearch_objs(
                            doc_data[incremental.cursor_path]
                        )
                    yield doc_data

                page = client.scroll(scroll_id=sid, scroll="5m")
                sid = page["_scroll_id"]
        finally:
            # Release the scroll context in every case: empty result set,
            # normal exhaustion, early generator close, or an error.
            client.clear_scroll(scroll_id=sid)

    return get_documents
|
|
69
|
+
|
|
70
|
+
|
|
71
|
+
def convert_elasticsearch_objs(value: Any) -> Any:
    """Convert a date-like string into a pendulum datetime when possible.

    Args:
        value: Any value read from a document.

    Returns:
        The result of ``ensure_pendulum_datetime`` when ``value`` is a string
        that parses to a date/datetime; otherwise ``value`` unchanged
        (non-strings, strings that do not parse, and strings that parse to
        some other kind of object).
    """
    if not isinstance(value, str):
        return value

    try:
        parsed_date = parse(value, strict=False)
    except (ValueError, OverflowError):
        # Not a date string: fall through to the "leave as-is" contract
        # instead of aborting the whole extraction.
        return value

    if isinstance(
        parsed_date,
        (pendulum.DateTime, pendulum.Date, datetime, date, str, float, int),
    ):
        return ensure_pendulum_datetime(parsed_date)
    return value
|
|
@@ -0,0 +1,141 @@
|
|
|
1
|
+
"""Elasticsearch destination helpers"""
|
|
2
|
+
|
|
3
|
+
import json
|
|
4
|
+
import logging
|
|
5
|
+
from typing import Any, Dict, Iterator, Set
|
|
6
|
+
from urllib.parse import urlparse
|
|
7
|
+
|
|
8
|
+
import dlt
|
|
9
|
+
|
|
10
|
+
from elasticsearch import Elasticsearch
|
|
11
|
+
from elasticsearch.helpers import bulk
|
|
12
|
+
|
|
13
|
+
# Suppress Elasticsearch transport logging
|
|
14
|
+
logging.getLogger("elasticsearch.transport").setLevel(logging.WARNING)
|
|
15
|
+
logging.getLogger("elastic_transport.transport").setLevel(logging.WARNING)
|
|
16
|
+
|
|
17
|
+
# Index names elasticsearch_insert has already handled in this process; an
# existing index is deleted only the first time its name is seen.
_cleared_indices: Set[str] = set()
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
def process_file_items(file_path: str) -> Iterator[Dict[str, Any]]:
    """Yield the documents stored in a JSON Lines file, without dlt metadata.

    Args:
        file_path: Path of a file containing one JSON object per line.

    Yields:
        One dict per non-blank line, with every key starting with ``_dlt_``
        removed.
    """
    # JSON text is UTF-8; do not depend on the platform's default encoding.
    with open(file_path, "r", encoding="utf-8") as f:
        for raw_line in f:
            line = raw_line.strip()
            if not line:
                continue
            doc = json.loads(line)
            # Clean DLT metadata
            yield {k: v for k, v in doc.items() if not k.startswith("_dlt_")}
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
def process_iterable_items(items: Any) -> Iterator[Dict[str, Any]]:
    """Yield a cleaned copy of every dict found in ``items``.

    Args:
        items: Any iterable. Elements that are not dicts are skipped.

    Yields:
        For each dict element, a new dict without the keys that start with
        ``_dlt_``.
    """
    for candidate in items:
        if not isinstance(candidate, dict):
            continue
        # Clean DLT metadata
        yield {
            key: candidate[key] for key in candidate if not key.startswith("_dlt_")
        }
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
@dlt.destination(
    name="elasticsearch",
    loader_file_format="typed-jsonl",
    batch_size=1000,
    naming_convention="snake_case",
)
def elasticsearch_insert(
    items, table, connection_string: str = dlt.secrets.value
) -> None:
    """Insert data into Elasticsearch index.

    The index is named after ``table["name"]``. The first time an index name
    is seen in this process, an existing index with that name is deleted
    before any document is written.

    Args:
        items: Data items (file path or iterable)
        table: Table metadata containing name and schema info
        connection_string: Elasticsearch connection string. A string starting
            with ``elasticsearch://`` is rewritten to ``https://`` (or
            ``http://`` when the ``secure`` query parameter is not one of
            ``true``/``1``/``yes``); any other string is used as the host URL
            unchanged.

    Raises:
        Exception: The bulk request raised or reported failed documents.
    """
    from urllib.parse import parse_qs

    uri_prefix = "elasticsearch://"

    # Build Elasticsearch client configuration
    actual_url = connection_string
    secure = True  # Default to HTTPS (secure by default)

    if connection_string.startswith(uri_prefix):
        # Strip only the leading scheme; the same text occurring later in the
        # string (e.g. inside credentials) must be left untouched.
        remainder = connection_string[len(uri_prefix):]

        # Check ?secure parameter (defaults to true)
        query_params = parse_qs(urlparse("http://" + remainder).query)
        if "secure" in query_params:
            secure = query_params["secure"][0].lower() in ["true", "1", "yes"]

        # Remove query params from URL for ES client, then add the scheme
        scheme = "https" if secure else "http"
        actual_url = f"{scheme}://{remainder.split('?')[0]}"

    parsed = urlparse(actual_url)

    es_config: Dict[str, Any] = {
        "hosts": [actual_url],
        "verify_certs": secure,
        "ssl_show_warn": False,
    }

    # Add authentication if present
    if parsed.username and parsed.password:
        es_config["http_auth"] = (parsed.username, parsed.password)

    # Get index name from table metadata
    index_name = table["name"]

    # Connect to Elasticsearch
    client = Elasticsearch(**es_config)

    # Drop a pre-existing index once per process and index name.
    if index_name not in _cleared_indices:
        if client.indices.exists(index=index_name):
            client.indices.delete(index=index_name)
        _cleared_indices.add(index_name)

    # Process and insert documents
    if isinstance(items, str):
        documents = process_file_items(items)
    else:
        documents = process_iterable_items(items)

    # Prepare documents for bulk insert as generator
    def doc_generator():
        for doc in documents:
            source = doc.copy()
            es_doc: Dict[str, Any] = {"_index": index_name, "_source": source}

            # Use _id if present (it is metadata, so it leaves the source),
            # then fall back to id; otherwise let ES generate one.
            if "_id" in doc:
                es_doc["_id"] = str(source.pop("_id"))
            elif "id" in doc:
                es_doc["_id"] = str(doc["id"])

            yield es_doc

    # Bulk insert
    try:
        _, failed_items = bulk(client, doc_generator(), request_timeout=60)
        if failed_items:
            failed_count = (
                len(failed_items) if isinstance(failed_items, list) else failed_items
            )
            raise Exception(
                f"Failed to insert {failed_count} documents: {failed_items}"
            )
    except Exception as e:
        # Chain the cause so the original traceback is not lost.
        raise Exception(f"Elasticsearch bulk insert failed: {str(e)}") from e
|
omniload/src/errors.py
ADDED
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
import requests
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
class MissingValueError(Exception):
    """Error whose message states that a value is required to connect to a source."""

    def __init__(self, value, source):
        """Build the message from the missing ``value`` and the ``source`` name."""
        message = f"{value} is required to connect to {source}"
        super().__init__(message)
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
class UnsupportedResourceError(Exception):
    """Error whose message says a resource is not yet supported for a source."""

    def __init__(self, resource, source):
        """Build the message from the ``resource`` and ``source`` names."""
        message = f"Resource '{resource}' is not supported for {source} source yet, if you are interested in it please create a GitHub issue at https://github.com/panodata/omniload"
        super().__init__(message)
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
class InvalidBlobTableError(Exception):
    """Error whose message says the source table is invalid and gives the expected format."""

    def __init__(self, source):
        """Build the message from the ``source`` name."""
        problem = f"Invalid source table for {source} "
        # Plain (non-f) string: the braces are shown literally to the user.
        expected_format = "Ensure that the table is in the format {bucket-name}/{file glob}"
        super().__init__(problem + expected_format)
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
class HTTPError(Exception):
    """Error carrying the status code and body text of a failed HTTP response."""

    def __init__(self, source: requests.HTTPError):
        """Build the message from ``source.response``.

        Args:
            source: The underlying HTTP error. When it has no attached
                response, the message falls back to ``str(source)``.
        """
        response = getattr(source, "response", None)
        # Compare with None explicitly: a response object's truthiness may
        # reflect its status rather than its presence.
        if response is None:
            message = str(source)
        else:
            message = f"HTTP {response.status_code}: {response.text}"
        super().__init__(message)
|