omniload 0.0.0.dev0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (218) hide show
  1. omniload/conftest.py +72 -0
  2. omniload/main.py +810 -0
  3. omniload/src/.gitignore +10 -0
  4. omniload/src/adjust/__init__.py +108 -0
  5. omniload/src/adjust/adjust_helpers.py +122 -0
  6. omniload/src/airtable/__init__.py +84 -0
  7. omniload/src/allium/__init__.py +128 -0
  8. omniload/src/anthropic/__init__.py +277 -0
  9. omniload/src/anthropic/helpers.py +525 -0
  10. omniload/src/applovin/__init__.py +316 -0
  11. omniload/src/applovin_max/__init__.py +117 -0
  12. omniload/src/appsflyer/__init__.py +325 -0
  13. omniload/src/appsflyer/client.py +110 -0
  14. omniload/src/appstore/__init__.py +142 -0
  15. omniload/src/appstore/client.py +126 -0
  16. omniload/src/appstore/errors.py +15 -0
  17. omniload/src/appstore/models.py +117 -0
  18. omniload/src/appstore/resources.py +179 -0
  19. omniload/src/arrow/__init__.py +81 -0
  20. omniload/src/asana_source/__init__.py +281 -0
  21. omniload/src/asana_source/helpers.py +30 -0
  22. omniload/src/asana_source/settings.py +158 -0
  23. omniload/src/attio/__init__.py +102 -0
  24. omniload/src/attio/helpers.py +65 -0
  25. omniload/src/blob.py +95 -0
  26. omniload/src/bruin/__init__.py +76 -0
  27. omniload/src/chess/__init__.py +180 -0
  28. omniload/src/chess/helpers.py +35 -0
  29. omniload/src/chess/settings.py +18 -0
  30. omniload/src/clickup/__init__.py +85 -0
  31. omniload/src/clickup/helpers.py +47 -0
  32. omniload/src/collector/spinner.py +43 -0
  33. omniload/src/couchbase_source/__init__.py +118 -0
  34. omniload/src/couchbase_source/helpers.py +135 -0
  35. omniload/src/cursor/__init__.py +83 -0
  36. omniload/src/cursor/helpers.py +188 -0
  37. omniload/src/customer_io/__init__.py +486 -0
  38. omniload/src/customer_io/helpers.py +530 -0
  39. omniload/src/destinations.py +982 -0
  40. omniload/src/docebo/__init__.py +589 -0
  41. omniload/src/docebo/client.py +435 -0
  42. omniload/src/docebo/helpers.py +97 -0
  43. omniload/src/dune/__init__.py +104 -0
  44. omniload/src/dune/helpers.py +108 -0
  45. omniload/src/dynamodb/__init__.py +86 -0
  46. omniload/src/elasticsearch/__init__.py +80 -0
  47. omniload/src/elasticsearch/helpers.py +141 -0
  48. omniload/src/errors.py +26 -0
  49. omniload/src/facebook_ads/__init__.py +403 -0
  50. omniload/src/facebook_ads/exceptions.py +19 -0
  51. omniload/src/facebook_ads/helpers.py +296 -0
  52. omniload/src/facebook_ads/settings.py +224 -0
  53. omniload/src/facebook_ads/utils.py +53 -0
  54. omniload/src/factory.py +305 -0
  55. omniload/src/filesystem/__init__.py +133 -0
  56. omniload/src/filesystem/helpers.py +114 -0
  57. omniload/src/filesystem/readers.py +187 -0
  58. omniload/src/filters.py +62 -0
  59. omniload/src/fireflies/__init__.py +151 -0
  60. omniload/src/fireflies/helpers.py +753 -0
  61. omniload/src/fluxx/__init__.py +10013 -0
  62. omniload/src/fluxx/helpers.py +233 -0
  63. omniload/src/frankfurter/__init__.py +157 -0
  64. omniload/src/frankfurter/helpers.py +48 -0
  65. omniload/src/freshdesk/__init__.py +103 -0
  66. omniload/src/freshdesk/freshdesk_client.py +151 -0
  67. omniload/src/freshdesk/settings.py +23 -0
  68. omniload/src/fundraiseup/__init__.py +95 -0
  69. omniload/src/fundraiseup/client.py +81 -0
  70. omniload/src/github/__init__.py +202 -0
  71. omniload/src/github/helpers.py +207 -0
  72. omniload/src/github/queries.py +129 -0
  73. omniload/src/github/settings.py +24 -0
  74. omniload/src/google_ads/__init__.py +198 -0
  75. omniload/src/google_ads/field.py +17 -0
  76. omniload/src/google_ads/metrics.py +254 -0
  77. omniload/src/google_ads/predicates.py +37 -0
  78. omniload/src/google_ads/reports.py +411 -0
  79. omniload/src/google_ads/test_google_ads.py +184 -0
  80. omniload/src/google_analytics/__init__.py +144 -0
  81. omniload/src/google_analytics/helpers.py +312 -0
  82. omniload/src/google_sheets/README.md +95 -0
  83. omniload/src/google_sheets/__init__.py +166 -0
  84. omniload/src/google_sheets/helpers/__init__.py +15 -0
  85. omniload/src/google_sheets/helpers/api_calls.py +160 -0
  86. omniload/src/google_sheets/helpers/data_processing.py +316 -0
  87. omniload/src/gorgias/__init__.py +595 -0
  88. omniload/src/gorgias/helpers.py +166 -0
  89. omniload/src/hostaway/__init__.py +302 -0
  90. omniload/src/hostaway/client.py +288 -0
  91. omniload/src/http/__init__.py +38 -0
  92. omniload/src/http/readers.py +146 -0
  93. omniload/src/http_client.py +24 -0
  94. omniload/src/hubspot/__init__.py +800 -0
  95. omniload/src/hubspot/helpers.py +417 -0
  96. omniload/src/hubspot/settings.py +329 -0
  97. omniload/src/indeed/__init__.py +153 -0
  98. omniload/src/indeed/helpers.py +228 -0
  99. omniload/src/influxdb/__init__.py +46 -0
  100. omniload/src/influxdb/client.py +34 -0
  101. omniload/src/intercom/__init__.py +142 -0
  102. omniload/src/intercom/helpers.py +674 -0
  103. omniload/src/intercom/settings.py +279 -0
  104. omniload/src/isoc_pulse/__init__.py +159 -0
  105. omniload/src/jira_source/__init__.py +377 -0
  106. omniload/src/jira_source/helpers.py +510 -0
  107. omniload/src/jira_source/settings.py +184 -0
  108. omniload/src/kafka/__init__.py +120 -0
  109. omniload/src/kafka/helpers.py +241 -0
  110. omniload/src/kinesis/__init__.py +153 -0
  111. omniload/src/kinesis/helpers.py +96 -0
  112. omniload/src/klaviyo/__init__.py +237 -0
  113. omniload/src/klaviyo/client.py +212 -0
  114. omniload/src/klaviyo/helpers.py +19 -0
  115. omniload/src/linear/__init__.py +634 -0
  116. omniload/src/linear/helpers.py +111 -0
  117. omniload/src/linkedin_ads/__init__.py +266 -0
  118. omniload/src/linkedin_ads/dimension_time_enum.py +17 -0
  119. omniload/src/linkedin_ads/helpers.py +246 -0
  120. omniload/src/loader.py +69 -0
  121. omniload/src/mailchimp/__init__.py +126 -0
  122. omniload/src/mailchimp/helpers.py +226 -0
  123. omniload/src/mailchimp/settings.py +164 -0
  124. omniload/src/masking.py +344 -0
  125. omniload/src/mixpanel/__init__.py +62 -0
  126. omniload/src/mixpanel/client.py +104 -0
  127. omniload/src/monday/__init__.py +246 -0
  128. omniload/src/monday/helpers.py +392 -0
  129. omniload/src/monday/settings.py +325 -0
  130. omniload/src/mongodb/__init__.py +281 -0
  131. omniload/src/mongodb/helpers.py +975 -0
  132. omniload/src/notion/__init__.py +69 -0
  133. omniload/src/notion/helpers/__init__.py +14 -0
  134. omniload/src/notion/helpers/client.py +178 -0
  135. omniload/src/notion/helpers/database.py +92 -0
  136. omniload/src/notion/settings.py +17 -0
  137. omniload/src/partition.py +32 -0
  138. omniload/src/personio/__init__.py +345 -0
  139. omniload/src/personio/helpers.py +100 -0
  140. omniload/src/phantombuster/__init__.py +65 -0
  141. omniload/src/phantombuster/client.py +87 -0
  142. omniload/src/pinterest/__init__.py +82 -0
  143. omniload/src/pipedrive/__init__.py +212 -0
  144. omniload/src/pipedrive/helpers/__init__.py +37 -0
  145. omniload/src/pipedrive/helpers/custom_fields_munger.py +116 -0
  146. omniload/src/pipedrive/helpers/pages.py +129 -0
  147. omniload/src/pipedrive/settings.py +41 -0
  148. omniload/src/pipedrive/typing.py +17 -0
  149. omniload/src/plusvibeai/__init__.py +335 -0
  150. omniload/src/plusvibeai/helpers.py +544 -0
  151. omniload/src/plusvibeai/settings.py +252 -0
  152. omniload/src/primer/__init__.py +45 -0
  153. omniload/src/primer/helpers.py +79 -0
  154. omniload/src/quickbooks/__init__.py +117 -0
  155. omniload/src/reddit_ads/__init__.py +183 -0
  156. omniload/src/reddit_ads/helpers.py +232 -0
  157. omniload/src/resource.py +40 -0
  158. omniload/src/revenuecat/__init__.py +83 -0
  159. omniload/src/revenuecat/helpers.py +237 -0
  160. omniload/src/salesforce/__init__.py +170 -0
  161. omniload/src/salesforce/helpers.py +78 -0
  162. omniload/src/shopify/__init__.py +1953 -0
  163. omniload/src/shopify/exceptions.py +17 -0
  164. omniload/src/shopify/helpers.py +202 -0
  165. omniload/src/shopify/settings.py +19 -0
  166. omniload/src/slack/__init__.py +290 -0
  167. omniload/src/slack/helpers.py +218 -0
  168. omniload/src/slack/settings.py +36 -0
  169. omniload/src/smartsheets/__init__.py +82 -0
  170. omniload/src/snapchat_ads/__init__.py +455 -0
  171. omniload/src/snapchat_ads/client.py +72 -0
  172. omniload/src/snapchat_ads/helpers.py +630 -0
  173. omniload/src/snapchat_ads/settings.py +130 -0
  174. omniload/src/socrata_source/__init__.py +83 -0
  175. omniload/src/socrata_source/helpers.py +85 -0
  176. omniload/src/socrata_source/settings.py +8 -0
  177. omniload/src/solidgate/__init__.py +219 -0
  178. omniload/src/solidgate/helpers.py +154 -0
  179. omniload/src/sources.py +5408 -0
  180. omniload/src/sql_database/__init__.py +0 -0
  181. omniload/src/sql_database/callbacks.py +66 -0
  182. omniload/src/stripe_analytics/__init__.py +183 -0
  183. omniload/src/stripe_analytics/helpers.py +386 -0
  184. omniload/src/stripe_analytics/settings.py +80 -0
  185. omniload/src/table_definition.py +15 -0
  186. omniload/src/testdata/fakebqcredentials.json +14 -0
  187. omniload/src/tiktok_ads/__init__.py +150 -0
  188. omniload/src/tiktok_ads/tiktok_helpers.py +130 -0
  189. omniload/src/time.py +11 -0
  190. omniload/src/trustpilot/__init__.py +48 -0
  191. omniload/src/trustpilot/client.py +48 -0
  192. omniload/src/version.py +6 -0
  193. omniload/src/wise/__init__.py +68 -0
  194. omniload/src/wise/client.py +63 -0
  195. omniload/src/zendesk/__init__.py +480 -0
  196. omniload/src/zendesk/helpers/__init__.py +39 -0
  197. omniload/src/zendesk/helpers/api_helpers.py +119 -0
  198. omniload/src/zendesk/helpers/credentials.py +68 -0
  199. omniload/src/zendesk/helpers/talk_api.py +132 -0
  200. omniload/src/zendesk/settings.py +71 -0
  201. omniload/src/zoom/__init__.py +99 -0
  202. omniload/src/zoom/helpers.py +102 -0
  203. omniload/testdata/.gitignore +2 -0
  204. omniload/testdata/create_replace.csv +21 -0
  205. omniload/testdata/delete_insert_expected.csv +6 -0
  206. omniload/testdata/delete_insert_part1.csv +5 -0
  207. omniload/testdata/delete_insert_part2.csv +6 -0
  208. omniload/testdata/merge_expected.csv +5 -0
  209. omniload/testdata/merge_part1.csv +4 -0
  210. omniload/testdata/merge_part2.csv +5 -0
  211. omniload/tests/unit/test_smartsheets.py +133 -0
  212. omniload-0.0.0.dev0.dist-info/METADATA +439 -0
  213. omniload-0.0.0.dev0.dist-info/RECORD +218 -0
  214. omniload-0.0.0.dev0.dist-info/WHEEL +4 -0
  215. omniload-0.0.0.dev0.dist-info/entry_points.txt +2 -0
  216. omniload-0.0.0.dev0.dist-info/licenses/LICENSE.Apache-2.0 +201 -0
  217. omniload-0.0.0.dev0.dist-info/licenses/LICENSE.md +21 -0
  218. omniload-0.0.0.dev0.dist-info/licenses/NOTICE +35 -0
@@ -0,0 +1,108 @@
1
+ import time
2
+ from typing import Any, Iterator
3
+
4
+ BASE_URL = "https://api.dune.com/api/v1"
5
+
6
+
7
def poll_execution(session, headers: dict, execution_id: str) -> None:
    """Wait for a Dune execution to complete, raising if it ends any other way.

    The status endpoint is queried repeatedly. While the reported state is
    pending or executing the function sleeps 5 seconds and tries again, for at
    most 8640 such waits (12 hours in total).

    Args:
        session: Object exposing ``get(url, headers=...)`` whose result offers
            ``raise_for_status()`` and ``json()``.
        headers: Headers sent with every status request.
        execution_id: Identifier of the execution to wait for.

    Raises:
        ValueError: The execution failed, was cancelled, expired, or reported
            a state this function does not know.
        TimeoutError: The execution was still running after the last wait.
    """
    wait_seconds = 5
    allowed_waits = 8640  # Max 12 hours with 5-second intervals
    in_progress = ("QUERY_STATE_PENDING", "QUERY_STATE_EXECUTING")

    # Only the "still running" branch falls through to the next iteration;
    # every other state returns or raises, so each iteration is one wait.
    for _ in range(allowed_waits):
        reply = session.get(
            f"{BASE_URL}/execution/{execution_id}/status",
            headers=headers,
        )
        reply.raise_for_status()
        payload = reply.json()
        state = payload.get("state")

        if state == "QUERY_STATE_COMPLETED":
            return

        if state in in_progress:
            time.sleep(wait_seconds)
            continue

        if state == "QUERY_STATE_FAILED":
            error = payload.get("error", {})
            if isinstance(error, dict):
                error_msg = error.get("message", "Unknown error")
            else:
                error_msg = str(error)
            raise ValueError(f"Query execution failed: {error_msg}")

        if state == "QUERY_STATE_CANCELLED":
            raise ValueError("Query execution was cancelled")

        if state == "QUERY_STATE_EXPIRED":
            raise ValueError("Query execution expired")

        raise ValueError(f"Unknown query state: {state}")

    raise TimeoutError(
        f"Query execution timed out after {allowed_waits * wait_seconds} seconds"
    )
44
+
45
+
46
def fetch_results(
    session, headers: dict, execution_id: str
) -> Iterator[dict[str, Any]]:
    """Yield every result row of an execution, following ``next_offset``.

    Pages of up to 1000 rows are requested, starting at offset 0. Iteration
    stops at the first page with no rows, or when the response carries no
    (truthy) ``next_offset``.

    Args:
        session: Object exposing ``get(url, headers=..., params=...)``.
        headers: Headers sent with every results request.
        execution_id: Identifier of the execution whose rows are read.

    Yields:
        Each element of ``result.rows`` from the successive responses.
    """
    rows_per_page = 1000
    position = 0

    while True:
        reply = session.get(
            f"{BASE_URL}/execution/{execution_id}/results",
            headers=headers,
            params={"limit": rows_per_page, "offset": position},
        )
        reply.raise_for_status()
        payload = reply.json()

        page_rows = payload.get("result", {}).get("rows", [])
        if not page_rows:
            return

        yield from page_rows

        following = payload.get("next_offset")
        if not following:
            return
        position = following
79
+
80
+
81
def fetch_queries(session, headers: dict) -> Iterator[dict[str, Any]]:
    """Yield every entry of the ``/queries`` listing, 100 per request.

    The offset advances by the number of entries actually received. Iteration
    stops on an empty page or once the offset reaches the ``total`` reported
    by the response (treated as 0 when the key is missing).

    Args:
        session: Object exposing ``get(url, headers=..., params=...)``.
        headers: Headers sent with every listing request.

    Yields:
        Each element of the ``queries`` list from the successive responses.
    """
    per_page = 100
    seen = 0

    while True:
        reply = session.get(
            f"{BASE_URL}/queries",
            headers=headers,
            params={"limit": per_page, "offset": seen},
        )
        reply.raise_for_status()
        payload = reply.json()

        batch = payload.get("queries", [])
        if not batch:
            return

        yield from batch

        seen += len(batch)
        if seen >= payload.get("total", 0):
            return
@@ -0,0 +1,86 @@
1
+ from dataclasses import dataclass
2
+ from typing import Optional
3
+
4
+ import boto3
5
+ import dlt
6
+ from boto3.dynamodb.conditions import Attr
7
+ from dlt.common.configuration.specs import AwsCredentials
8
+
9
+ PAGINATION_KEY = "LastEvaluatedKey"
10
+ FILTER_KEY = "FilterExpression"
11
+ DATA_KEY = "Items"
12
+
13
+
14
@dataclass
class TableSchema:
    """Names of the key attributes of a table."""

    # Attribute name of the "HASH" key entry, or None when not set.
    primary_key: Optional[str]
    # Attribute name of the "RANGE" key entry, or None when not set.
    sort_key: Optional[str]
18
+
19
+
20
def parseSchema(table) -> TableSchema:
    """Extract the key attribute names from a table's ``key_schema``.

    Args:
        table: Object exposing ``key_schema`` (an iterable of mappings with
            ``"KeyType"`` and ``"AttributeName"`` entries) and ``name``.

    Returns:
        A ``TableSchema`` holding the ``"HASH"`` attribute as primary key and
        the ``"RANGE"`` attribute (if any) as sort key.

    Raises:
        ValueError: No ``"HASH"`` entry was found.
    """
    hash_attribute = None
    range_attribute = None

    for entry in table.key_schema:
        key_type = entry["KeyType"]
        if key_type == "HASH":
            hash_attribute = entry["AttributeName"]
        elif key_type == "RANGE":
            range_attribute = entry["AttributeName"]

    if hash_attribute is None:
        raise ValueError(f"Table {table.name} has no primary key!")

    return TableSchema(hash_attribute, range_attribute)
33
+
34
+
35
@dlt.source
def dynamodb(
    table_name: str,
    credentials: AwsCredentials,
    incremental: Optional[dlt.sources.incremental] = None,
):
    """dlt source yielding one resource that scans a DynamoDB table.

    Args:
        table_name: Name of the table to read.
        credentials: AWS credentials; the access key, secret key, session
            token (when present), region and endpoint URL are used.
        incremental: Optional incremental cursor forwarded to the table scan.

    Yields:
        The ``dynamodb_table`` resource bound to the table, with the table's
        HASH key attribute as its primary key.

    Raises:
        ValueError: The table has no HASH key (raised by ``parseSchema``).
    """
    sesh = boto3.Session(
        aws_access_key_id=credentials.aws_access_key_id,
        aws_secret_access_key=credentials.aws_secret_access_key,
        # Temporary (STS) credentials are only valid together with their
        # session token; getattr keeps credential objects without the
        # attribute working exactly as before.
        aws_session_token=getattr(credentials, "aws_session_token", None),
        region_name=credentials.region_name,
    )
    db = sesh.resource("dynamodb", endpoint_url=credentials.endpoint_url)
    table = db.Table(table_name)
    schema = parseSchema(table)
    resource = dlt.resource(
        dynamodb_table,
        primary_key=schema.primary_key,
    )

    yield resource(table, incremental)
55
+
56
+
57
def dynamodb_table(
    table,
    incremental: Optional[dlt.sources.incremental] = None,
):
    """Yield every item returned by scanning ``table``, page by page.

    Args:
        table: Object exposing ``scan(**kwargs)`` that returns a mapping with
            an ``"Items"`` entry and, while more pages remain, a
            ``"LastEvaluatedKey"`` entry.
        incremental: Optional incremental cursor used to build the scan
            filter via ``build_scan_args``.

    Yields:
        The items of each scan page, in order.
    """
    scan_kwargs = build_scan_args(incremental)

    page = table.scan(**scan_kwargs)
    yield from page[DATA_KEY]

    # A page carrying the pagination key means another page follows it.
    while PAGINATION_KEY in page:
        page = table.scan(ExclusiveStartKey=page[PAGINATION_KEY], **scan_kwargs)
        yield from page[DATA_KEY]
68
+
69
+
70
def build_scan_args(
    incremental: Optional[dlt.sources.incremental] = None,
):
    """Build the keyword arguments for a table scan.

    Args:
        incremental: Optional incremental cursor. Its ``cursor_path``,
            ``last_value`` and ``end_value`` are read.

    Returns:
        An empty dict when there is no cursor or its ``last_value`` is falsy.
        Otherwise a dict whose ``"FilterExpression"`` is a ``between``
        condition on the cursor attribute when ``end_value`` is truthy, or a
        ``gte`` condition on ``last_value`` when it is not.
    """
    scan_args = {}

    # No cursor, or nothing loaded yet: scan without a filter.
    if incremental is None or not incremental.last_value:
        return scan_args

    cursor_attribute = Attr(incremental.cursor_path)
    if incremental.end_value:
        scan_args[FILTER_KEY] = cursor_attribute.between(
            incremental.last_value, incremental.end_value
        )
    else:
        scan_args[FILTER_KEY] = cursor_attribute.gte(incremental.last_value)

    return scan_args
@@ -0,0 +1,80 @@
1
+ from datetime import date, datetime
2
+ from typing import Any, Optional
3
+
4
+ import dlt
5
+ import pendulum
6
+ from dlt.common.time import ensure_pendulum_datetime
7
+ from pendulum import parse
8
+
9
+ from elasticsearch import Elasticsearch
10
+
11
+
12
@dlt.source
def elasticsearch_source(
    connection_url: str,
    index: str,
    verify_certs: bool,
    incremental: Optional[dlt.sources.incremental] = None,
):
    """dlt source that reads all documents of an Elasticsearch index.

    Args:
        connection_url: URL handed to the ``Elasticsearch`` client.
        index: Index to read; also used as the resource name.
        verify_certs: Forwarded to the client as ``verify_certs``.
        incremental: Optional incremental cursor. When given, only documents
            whose cursor field is ``>= last_value`` (and ``< end_value`` when
            an end value is set) are requested.

    Returns:
        The ``get_documents`` resource (merge write disposition, primary key
        ``"id"``).
    """
    client = Elasticsearch(connection_url, verify_certs=verify_certs)

    @dlt.resource(
        name=index, primary_key="id", write_disposition="merge", incremental=incremental
    )
    def get_documents(incremental=incremental):
        """Yield each hit as ``{"id": <_id>, **_source}`` using the scroll API."""
        body = {"query": {"match_all": {}}}

        if incremental:
            range_filter = {"gte": incremental.last_value}
            if incremental.end_value is not None:
                range_filter["lt"] = incremental.end_value
            body = {"query": {"range": {incremental.cursor_path: range_filter}}}

        sid = None
        try:
            # The first page comes from .search, every later one from .scroll.
            page = client.search(index=index, scroll="5m", size=1000, body=body)
            while True:
                sid = page["_scroll_id"]
                hits = page["hits"]["hits"]
                if not hits:
                    break

                for doc in hits:
                    doc_data = {"id": doc["_id"], **doc["_source"]}
                    if incremental:
                        doc_data[incremental.cursor_path] = convert_elasticsearch_objs(
                            doc_data[incremental.cursor_path]
                        )
                    yield doc_data

                page = client.scroll(scroll_id=sid, scroll="5m")
        finally:
            # Release the scroll on every exit path: an empty first page, the
            # consumer closing the generator early, or an error mid-scroll.
            if sid is not None:
                client.clear_scroll(scroll_id=sid)

    return get_documents
69
+
70
+
71
def convert_elasticsearch_objs(value: Any) -> Any:
    """Turn a string cursor value into a pendulum datetime where possible.

    Args:
        value: Value read from a document's cursor field.

    Returns:
        ``value`` unchanged when it is not a string, when parsing yields
        ``None``, or when the parsed object is not one of the accepted types.
        Otherwise the result of ``ensure_pendulum_datetime`` on the parsed
        object.
    """
    if not isinstance(value, str):
        return value

    parsed_date = parse(value, strict=False)
    if parsed_date is None:
        return value

    accepted_types = (
        pendulum.DateTime,
        pendulum.Date,
        datetime,
        date,
        str,
        float,
        int,
    )
    if isinstance(parsed_date, accepted_types):
        return ensure_pendulum_datetime(parsed_date)
    return value
@@ -0,0 +1,141 @@
1
+ """Elasticsearch destination helpers"""
2
+
3
+ import json
4
+ import logging
5
+ from typing import Any, Dict, Iterator, Set
6
+ from urllib.parse import urlparse
7
+
8
+ import dlt
9
+
10
+ from elasticsearch import Elasticsearch
11
+ from elasticsearch.helpers import bulk
12
+
13
+ # Suppress Elasticsearch transport logging
14
+ logging.getLogger("elasticsearch.transport").setLevel(logging.WARNING)
15
+ logging.getLogger("elastic_transport.transport").setLevel(logging.WARNING)
16
+
17
+ _cleared_indices: Set[str] = set()
18
+
19
+
20
def process_file_items(file_path: str) -> Iterator[Dict[str, Any]]:
    """Yield the documents stored in a JSONL file, without DLT metadata.

    Args:
        file_path: Path of a file holding one JSON object per line.

    Yields:
        One dict per non-blank line, with every key starting with ``_dlt_``
        removed.
    """
    # JSON text is UTF-8; an explicit encoding keeps non-ASCII documents
    # readable on platforms whose locale default is something else.
    with open(file_path, "r", encoding="utf-8") as f:
        for line in f:
            stripped = line.strip()
            if not stripped:
                continue
            doc = json.loads(stripped)
            # Clean DLT metadata
            yield {k: v for k, v in doc.items() if not k.startswith("_dlt_")}
31
+
32
+
33
def process_iterable_items(items: Any) -> Iterator[Dict[str, Any]]:
    """Yield the dict items of an iterable, without DLT metadata.

    Args:
        items: Iterable of candidate documents.

    Yields:
        A copy of each dict item with every key starting with ``_dlt_``
        removed. Items that are not dicts are skipped.
    """
    for candidate in items:
        if not isinstance(candidate, dict):
            continue
        # Clean DLT metadata
        yield {
            key: value
            for key, value in candidate.items()
            if not key.startswith("_dlt_")
        }
40
+
41
+
42
@dlt.destination(
    name="elasticsearch",
    loader_file_format="typed-jsonl",
    batch_size=1000,
    naming_convention="snake_case",
)
def elasticsearch_insert(
    items, table, connection_string: str = dlt.secrets.value
) -> None:
    """Insert data into Elasticsearch index.

    A connection string starting with ``elasticsearch://`` is rewritten to an
    ``https://`` URL, or ``http://`` when its ``?secure=`` query parameter is
    not one of ``true``/``1``/``yes``; the query string is dropped. The first
    time an index name is seen in this process, an existing index of that
    name is deleted before writing.

    Args:
        items: Data items (file path or iterable)
        table: Table metadata containing name and schema info
        connection_string: Elasticsearch connection string

    Raises:
        Exception: The bulk insert raised or reported failed documents; the
            underlying error is attached as ``__cause__``.
    """
    # Parse connection string
    parsed = urlparse(connection_string)

    # Build Elasticsearch client configuration
    actual_url = connection_string
    secure = True  # Default to HTTPS (secure by default)

    if connection_string.startswith("elasticsearch://"):
        actual_url = connection_string.replace("elasticsearch://", "")

        # Parse to check for query parameters
        temp_parsed = urlparse("http://" + actual_url)
        from urllib.parse import parse_qs

        query_params = parse_qs(temp_parsed.query)

        # Check ?secure parameter (defaults to true)
        if "secure" in query_params:
            secure = query_params["secure"][0].lower() in ["true", "1", "yes"]

        # Remove query params from URL for ES client
        actual_url = actual_url.split("?")[0]

        # Add scheme
        scheme = "https" if secure else "http"
        actual_url = f"{scheme}://{actual_url}"

        parsed = urlparse(actual_url)

    es_config: Dict[str, Any] = {
        "hosts": [actual_url],
        "verify_certs": secure,
        "ssl_show_warn": False,
    }

    # Add authentication if present
    if parsed.username and parsed.password:
        es_config["http_auth"] = (parsed.username, parsed.password)

    # Get index name from table metadata
    index_name = table["name"]

    # Connect to Elasticsearch
    client = Elasticsearch(**es_config)

    # A client is created on every call of this function, so it is closed
    # on every exit path instead of leaving its connections open.
    try:
        if index_name not in _cleared_indices:
            if client.indices.exists(index=index_name):
                client.indices.delete(index=index_name)
            _cleared_indices.add(index_name)

        # Process and insert documents
        if isinstance(items, str):
            documents = process_file_items(items)
        else:
            documents = process_iterable_items(items)

        # Prepare documents for bulk insert as generator
        def doc_generator():
            for doc in documents:
                source = doc.copy()
                es_doc: Dict[str, Any] = {"_index": index_name, "_source": source}

                # Use _id if present, otherwise let ES generate one
                if "_id" in doc:
                    # _id is metadata, so it is taken out of the source body
                    es_doc["_id"] = str(source.pop("_id"))
                elif "id" in doc:
                    es_doc["_id"] = str(doc["id"])

                yield es_doc

        # Bulk insert
        try:
            _, failed_items = bulk(client, doc_generator(), request_timeout=60)
            if failed_items:
                failed_count = (
                    len(failed_items)
                    if isinstance(failed_items, list)
                    else failed_items
                )
                raise Exception(
                    f"Failed to insert {failed_count} documents: {failed_items}"
                )
        except Exception as e:
            # Chain the cause so the original traceback is not lost.
            raise Exception(f"Elasticsearch bulk insert failed: {str(e)}") from e
    finally:
        client.close()
omniload/src/errors.py ADDED
@@ -0,0 +1,26 @@
1
+ import requests
2
+
3
+
4
class MissingValueError(Exception):
    """Raised when a value needed to connect to a source was not provided."""

    def __init__(self, value, source):
        """Build the message from the missing ``value`` and the ``source`` name."""
        message = f"{value} is required to connect to {source}"
        super().__init__(message)
7
+
8
+
9
class UnsupportedResourceError(Exception):
    """Raised when a requested resource is not available for a source."""

    def __init__(self, resource, source):
        """Build the message from the ``resource`` and ``source`` names."""
        message = (
            f"Resource '{resource}' is not supported for {source} source yet, "
            "if you are interested in it please create a GitHub issue at "
            "https://github.com/panodata/omniload"
        )
        super().__init__(message)
14
+
15
+
16
class InvalidBlobTableError(Exception):
    """Raised when a blob-storage source table is not ``bucket/glob`` shaped."""

    def __init__(self, source):
        """Build the message naming the offending ``source``."""
        # The format hint is a plain string, so its braces are emitted literally.
        format_hint = "Ensure that the table is in the format {bucket-name}/{file glob}"
        message = f"Invalid source table for {source} " + format_hint
        super().__init__(message)
22
+
23
+
24
class HTTPError(Exception):
    """Wraps an HTTP error, exposing its status code and body as the message."""

    # The annotation is quoted so it is only resolved by type checkers.
    def __init__(self, source: "requests.HTTPError"):
        """Build the message from ``source.response``.

        Args:
            source: The originating HTTP error. Its ``response`` may be
                missing or ``None``, in which case the error's own text is
                used instead of a status code and body.
        """
        # Compare against None explicitly rather than by truthiness, so an
        # existing response is always used regardless of how it evaluates.
        response = getattr(source, "response", None)
        if response is None:
            message = f"HTTP error: {source}"
        else:
            message = f"HTTP {response.status_code}: {response.text}"
        super().__init__(message)