ingestr 0.12.9__py3-none-any.whl → 0.12.11__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of ingestr might be problematic; see the registry's advisory page for more details.

ingestr/src/sources.py CHANGED
@@ -3,7 +3,8 @@ import csv
3
3
  import json
4
4
  import os
5
5
  import re
6
- from datetime import date, datetime, timedelta
6
+ import tempfile
7
+ from datetime import date, datetime, timedelta, timezone
7
8
  from typing import (
8
9
  Any,
9
10
  Callable,
@@ -18,8 +19,8 @@ from urllib.parse import ParseResult, parse_qs, quote, urlparse
18
19
 
19
20
  import dlt
20
21
  import gcsfs # type: ignore
21
- import s3fs # type: ignore
22
22
  import pendulum
23
+ import s3fs # type: ignore
23
24
  from dlt.common.configuration.specs import (
24
25
  AwsCredentials,
25
26
  )
@@ -41,9 +42,11 @@ from dlt.sources.sql_database.schema_types import (
41
42
  Table,
42
43
  TTypeAdapter,
43
44
  )
45
+ from google.ads.googleads.client import GoogleAdsClient # type: ignore
44
46
  from sqlalchemy import Column
45
47
  from sqlalchemy import types as sa
46
48
 
49
+ from ingestr.src import blob
47
50
  from ingestr.src.adjust import REQUIRED_CUSTOM_DIMENSIONS, adjust_source
48
51
  from ingestr.src.adjust.adjust_helpers import parse_filters
49
52
  from ingestr.src.airtable import airtable_source
@@ -55,6 +58,7 @@ from ingestr.src.asana_source import asana_source
55
58
  from ingestr.src.chess import source
56
59
  from ingestr.src.dynamodb import dynamodb
57
60
  from ingestr.src.errors import (
61
+ InvalidBlobTableError,
58
62
  MissingValueError,
59
63
  UnsupportedResourceError,
60
64
  )
@@ -62,6 +66,7 @@ from ingestr.src.facebook_ads import facebook_ads_source, facebook_insights_sour
62
66
  from ingestr.src.filesystem import readers
63
67
  from ingestr.src.filters import table_adapter_exclude_columns
64
68
  from ingestr.src.github import github_reactions, github_repo_events, github_stargazers
69
+ from ingestr.src.google_ads import google_ads
65
70
  from ingestr.src.google_analytics import google_analytics
66
71
  from ingestr.src.google_sheets import google_spreadsheet
67
72
  from ingestr.src.gorgias import gorgias_source
@@ -69,6 +74,11 @@ from ingestr.src.hubspot import hubspot
69
74
  from ingestr.src.kafka import kafka_consumer
70
75
  from ingestr.src.kafka.helpers import KafkaCredentials
71
76
  from ingestr.src.klaviyo._init_ import klaviyo_source
77
+ from ingestr.src.linkedin_ads import linked_in_ads_source
78
+ from ingestr.src.linkedin_ads.dimension_time_enum import (
79
+ Dimension,
80
+ TimeGranularity,
81
+ )
72
82
  from ingestr.src.mongodb import mongodb_collection
73
83
  from ingestr.src.notion import notion_databases
74
84
  from ingestr.src.shopify import shopify_source
@@ -1090,16 +1100,11 @@ class S3Source:
1090
1100
  if not secret_access_key:
1091
1101
  raise ValueError("secret_access_key is required to connect to S3")
1092
1102
 
1093
- bucket_name = parsed_uri.hostname
1094
- if not bucket_name:
1095
- raise ValueError(
1096
- "Invalid S3 URI: The bucket name is missing. Ensure your S3 URI follows the format 's3://bucket-name"
1097
- )
1098
- bucket_url = f"s3://{bucket_name}"
1103
+ bucket_name, path_to_file = blob.parse_uri(parsed_uri, table)
1104
+ if not bucket_name or not path_to_file:
1105
+ raise InvalidBlobTableError("S3")
1099
1106
 
1100
- path_to_file = parsed_uri.path.lstrip("/") or table.lstrip("/")
1101
- if not path_to_file:
1102
- raise ValueError("--source-table must be specified")
1107
+ bucket_url = f"s3://{bucket_name}/"
1103
1108
 
1104
1109
  fs = s3fs.S3FileSystem(
1105
1110
  key=access_key_id[0],
@@ -1118,9 +1123,7 @@ class S3Source:
1118
1123
  "S3 Source only supports specific formats files: csv, jsonl, parquet"
1119
1124
  )
1120
1125
 
1121
- return readers(
1122
- bucket_url, fs, path_to_file
1123
- ).with_resources(endpoint)
1126
+ return readers(bucket_url, fs, path_to_file).with_resources(endpoint)
1124
1127
 
1125
1128
 
1126
1129
  class TikTokSource:
@@ -1327,6 +1330,7 @@ class DynamoDBSource:
1327
1330
  range_start="closed",
1328
1331
  )
1329
1332
 
1333
+ # TODO: validate the requested table before passing it to dynamodb.
1330
1334
  return dynamodb(table, creds, incremental)
1331
1335
 
1332
1336
 
@@ -1517,6 +1521,13 @@ class GCSSource:
1517
1521
 
1518
1522
  parsed_uri = urlparse(uri)
1519
1523
  params = parse_qs(parsed_uri.query)
1524
+
1525
+ bucket_name, path_to_file = blob.parse_uri(parsed_uri, table)
1526
+ if not bucket_name or not path_to_file:
1527
+ raise InvalidBlobTableError("GCS")
1528
+
1529
+ bucket_url = f"gs://{bucket_name}"
1530
+
1520
1531
  credentials_path = params.get("credentials_path")
1521
1532
  credentials_base64 = params.get("credentials_base64")
1522
1533
  credentials_available = any(
@@ -1528,17 +1539,6 @@ class GCSSource:
1528
1539
  if credentials_available is False:
1529
1540
  raise MissingValueError("credentials_path or credentials_base64", "GCS")
1530
1541
 
1531
- bucket_name = parsed_uri.hostname
1532
- if not bucket_name:
1533
- raise ValueError(
1534
- "Invalid GCS URI: The bucket name is missing. Ensure your GCS URI follows the format 'gs://bucket-name/path/to/file"
1535
- )
1536
- bucket_url = f"gs://{bucket_name}/"
1537
-
1538
- path_to_file = parsed_uri.path.lstrip("/") or table.lstrip("/")
1539
- if not path_to_file:
1540
- raise ValueError("--source-table must be specified")
1541
-
1542
1542
  credentials = None
1543
1543
  if credentials_path:
1544
1544
  credentials = credentials_path[0]
@@ -1566,6 +1566,173 @@ class GCSSource:
1566
1566
  "GCS Source only supports specific formats files: csv, jsonl, parquet"
1567
1567
  )
1568
1568
 
1569
- return readers(
1570
- bucket_url, fs, path_to_file
1571
- ).with_resources(endpoint)
1569
+ return readers(bucket_url, fs, path_to_file).with_resources(endpoint)
1570
+
1571
class GoogleAdsSource:
    """Source for Google Ads reports, addressed as googleads://<customer_id>?..."""

    def handles_incrementality(self) -> bool:
        # Incremental loading is managed by the google_ads source itself.
        return True

    def init_client(self, params: Dict[str, List[str]]) -> GoogleAdsClient:
        """Build a GoogleAdsClient from the URI query parameters.

        Requires ``dev_token`` plus either ``credentials_path`` or
        ``credentials_base64``. Base64-encoded credentials are decoded into a
        temporary file (the client can only load a JSON key from disk), which
        is always removed once the client has been constructed.

        Raises:
            MissingValueError: if the dev token or credentials are missing.
        """
        dev_token = params.get("dev_token")
        if dev_token is None or len(dev_token) == 0:
            raise MissingValueError("dev_token", "Google Ads")

        credentials_path = params.get("credentials_path")
        credentials_base64 = params.get("credentials_base64")
        if credentials_path is None and credentials_base64 is None:
            raise MissingValueError(
                "credentials_path or credentials_base64", "Google Ads"
            )

        path = None
        fd = None
        if credentials_path:
            path = credentials_path[0]
        else:
            # The client only accepts a key file on disk, so decode the
            # base64 payload into a temp file and clean it up afterwards.
            (fd, path) = tempfile.mkstemp(prefix="secret-")
            try:
                secret = base64.b64decode(credentials_base64[0])  # type: ignore
                os.write(fd, secret)
            except Exception:
                # Fix: don't leak the open fd and the on-disk secret if
                # decoding or writing fails.
                os.close(fd)
                os.remove(path)
                raise
            os.close(fd)

        conf = {
            "json_key_file_path": path,
            "use_proto_plus": True,
            "developer_token": dev_token[0],
        }
        try:
            client = GoogleAdsClient.load_from_dict(conf)
        finally:
            # Only remove the key file if we created it ourselves.
            if fd is not None:
                os.remove(path)

        return client

    def dlt_source(self, uri: str, table: str, **kwargs):
        """Create the Google Ads dlt source for ``table``.

        The customer id is taken from the URI host. When no interval is
        given, the report window defaults to the last 30 days.

        Raises:
            ValueError: if ``incremental_key`` is supplied.
            MissingValueError: if the customer id is missing.
            UnsupportedResourceError: if ``table`` is not a known resource.
        """
        if kwargs.get("incremental_key") is not None:
            raise ValueError(
                "Google Ads takes care of incrementality on its own, you should not provide incremental_key"
            )

        parsed_uri = urlparse(uri)

        customer_id = parsed_uri.hostname
        if not customer_id:
            raise MissingValueError("customer_id", "Google Ads")

        params = parse_qs(parsed_uri.query)
        client = self.init_client(params)

        # NOTE(review): interval_start may be a naive datetime while the
        # default below is UTC-aware — confirm callers always pass aware
        # datetimes before they are mixed in arithmetic.
        start_date = kwargs.get("interval_start") or datetime.now(
            tz=timezone.utc
        ) - timedelta(days=30)
        end_date = kwargs.get("interval_end")

        # most combinations of explicit start/end dates are automatically
        # handled. however, in the scenario where only the end date is
        # provided, we need to calculate the start date based on the end date.
        if (
            kwargs.get("interval_end") is not None
            and kwargs.get("interval_start") is None
        ):
            start_date = end_date - timedelta(days=30)  # type: ignore

        # A "daily:<spec>" table selects a custom daily report: the raw spec
        # is forwarded and the resource name becomes "daily_report".
        report_spec = None
        if table.startswith("daily:"):
            report_spec = table
            table = "daily_report"

        src = google_ads(
            client,
            customer_id,
            report_spec,
            start_date=start_date,
            end_date=end_date,
        )

        if table not in src.resources:
            raise UnsupportedResourceError(table, "Google Ads")

        return src.with_resources(table)
1662
+
1663
+
1664
class LinkedInAdsSource:
    """Source for LinkedIn Ads custom analytics reports."""

    def handles_incrementality(self) -> bool:
        # The custom-reports resource manages its own incremental window.
        return True

    def dlt_source(self, uri: str, table: str, **kwargs):
        """Build the LinkedIn Ads dlt source from a table spec.

        The spec has the form ``custom:<dimensions>:<metrics>``. The
        dimension list must contain a pivot ("campaign", "creative" or
        "account") and a time grain ("date" for daily, "month" for monthly).
        ``dateRange`` and ``pivotValues`` are always included in the metrics.
        """
        query = parse_qs(urlparse(uri).query)

        token_values = query.get("access_token")
        if not token_values:
            raise ValueError("access_token is required to connect to LinkedIn Ads")

        raw_accounts = query.get("account_ids")
        if not raw_accounts:
            raise ValueError("account_ids is required to connect to LinkedIn Ads")
        accounts = raw_accounts[0].replace(" ", "").split(",")

        since = kwargs.get("interval_start")
        until = kwargs.get("interval_end")
        start = (
            ensure_pendulum_datetime(since).date()
            if since
            else pendulum.datetime(2018, 1, 1).date()
        )
        end = ensure_pendulum_datetime(until).date() if until else None

        parts = table.split(":")
        if len(parts) != 3:
            raise ValueError(
                "Invalid table format. Expected format: custom:<dimensions>:<metrics>"
            )

        dims = [d for d in parts[1].replace(" ", "").split(",") if d.strip()]
        if not {"campaign", "creative", "account"} & set(dims):
            raise ValueError(
                "'campaign', 'creative' or 'account' is required to connect to LinkedIn Ads, please provide at least one of these dimensions."
            )
        if not {"date", "month"} & set(dims):
            raise ValueError(
                "'date' or 'month' is required to connect to LinkedIn Ads, please provide at least one of these dimensions."
            )

        # The time-grain dimension picks the granularity and is consumed
        # here; the first remaining dimension becomes the report pivot.
        if "date" in dims:
            granularity = TimeGranularity.daily
            dims.remove("date")
        else:
            granularity = TimeGranularity.monthly
            dims.remove("month")

        pivot = Dimension[dims[0]]

        metric_list = [m for m in parts[2].replace(" ", "").split(",") if m.strip()]
        for required in ("dateRange", "pivotValues"):
            if required not in metric_list:
                metric_list.append(required)

        return linked_in_ads_source(
            start_date=start,
            end_date=end,
            access_token=token_values[0],
            account_ids=accounts,
            dimension=pivot,
            metrics=metric_list,
            time_granularity=granularity,
        ).with_resources("custom_reports")
ingestr/src/version.py CHANGED
@@ -1 +1 @@
1
- __version__ = "0.12.9"
1
+ __version__ = "0.12.11"
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: ingestr
3
- Version: 0.12.9
3
+ Version: 0.12.11
4
4
  Summary: ingestr is a command-line application that ingests data from various sources and stores them in any database.
5
5
  Project-URL: Homepage, https://github.com/bruin-data/ingestr
6
6
  Project-URL: Issues, https://github.com/bruin-data/ingestr/issues
@@ -22,7 +22,9 @@ Requires-Dist: dlt==1.5.0
22
22
  Requires-Dist: duckdb-engine==0.13.5
23
23
  Requires-Dist: duckdb==1.1.3
24
24
  Requires-Dist: facebook-business==20.0.0
25
+ Requires-Dist: flatten-json==0.1.14
25
26
  Requires-Dist: gcsfs==2024.10.0
27
+ Requires-Dist: google-ads==25.1.0
26
28
  Requires-Dist: google-analytics-data==0.18.16
27
29
  Requires-Dist: google-api-python-client==2.130.0
28
30
  Requires-Dist: google-cloud-bigquery-storage==2.24.0
@@ -1,13 +1,14 @@
1
1
  ingestr/main.py,sha256=fRWnyoPzMvvxTa61EIAP_dsKu0B_0yOwoyt0Slq9WQU,24723
2
2
  ingestr/src/.gitignore,sha256=8cX1AZTSI0TcdZFGTmS_oyBjpfCzhOEt0DdAo2dFIY8,203
3
+ ingestr/src/blob.py,sha256=XDk_XqmU_He4sQ1brY3ceoZgpq_ZBZihz1gHW9MzqUk,1381
3
4
  ingestr/src/destinations.py,sha256=zcHJIIHAZmcD9sJomd6G1Bc-1KsxnBD2aByOSV_9L3g,8850
4
- ingestr/src/errors.py,sha256=MrdLY5Gpr3g3qbYjl-U8-m8kxBJQOJo4ZVOsQpQbRR8,447
5
- ingestr/src/factory.py,sha256=oNF9dovovLG34xLgRZ5fbyA_XSHxEuTW27s1cb35KDM,4622
5
+ ingestr/src/errors.py,sha256=Ufs4_DfE77_E3vnA1fOQdi6cmuLVNm7_SbFLkL1XPGk,686
6
+ ingestr/src/factory.py,sha256=D__Oy029z6y2OsAUMGab5K5ZmYhRXxDbD_SDc21b9Eo,4746
6
7
  ingestr/src/filters.py,sha256=0JQXeAr2APFMnW2sd-6BlAMWv93bXV17j8b5MM8sHmM,580
7
- ingestr/src/sources.py,sha256=JoO-IQ_eB4Ia1fC1GWs6N74l9A3tXQT-Fj0uNBiSI_Y,55978
8
+ ingestr/src/sources.py,sha256=jIq1qVj8_uOVbdrVuvs2uHkrLydd1i8XHMx5vhPVqAo,61682
8
9
  ingestr/src/table_definition.py,sha256=REbAbqdlmUMUuRh8nEQRreWjPVOQ5ZcfqGkScKdCrmk,390
9
10
  ingestr/src/time.py,sha256=H_Fk2J4ShXyUM-EMY7MqCLZQhlnZMZvO952bmZPc4yE,254
10
- ingestr/src/version.py,sha256=FSGqM7DffUSCa5R2rqVlNo-yNzBd6cgAXS1_0tElLy0,23
11
+ ingestr/src/version.py,sha256=92OWM_xUUgc7wxFngCUAzVKFahsSWsF4UXOgDEn2uVI,24
11
12
  ingestr/src/adjust/__init__.py,sha256=ULjtJqrNS6XDvUyGl0tjl12-tLyXlCgeFe2icTbtu3Q,3255
12
13
  ingestr/src/adjust/adjust_helpers.py,sha256=av97NPSn-hQtTbAC0vUSCAWYePmOiG5R-DGdMssm7FQ,3646
13
14
  ingestr/src/airtable/__init__.py,sha256=GHWYrjI2qhs_JihdNJysB0Ni3bzqT_MLXn_S9_Q5zRA,2775
@@ -37,6 +38,11 @@ ingestr/src/github/__init__.py,sha256=xVijF-Wi4p88hkVJnKH-oTixismjD3aUcGqGa6Wr4e
37
38
  ingestr/src/github/helpers.py,sha256=Tmnik9811zBWNO6cJwV9PFQxEx2j32LHAQCvNbubsEI,6759
38
39
  ingestr/src/github/queries.py,sha256=W34C02jUEdjFmOE7f7u9xvYyBNDMfVZAu0JIRZI2mkU,2302
39
40
  ingestr/src/github/settings.py,sha256=N5ahWrDIQ_4IWV9i-hTXxyYduqY9Ym2BTwqsWxcDdJ8,258
41
+ ingestr/src/google_ads/__init__.py,sha256=bH0TtnRWcOUESezpvoA7VEUHAq_0ITGQeX4GGVBfl1I,3725
42
+ ingestr/src/google_ads/field.py,sha256=uc8KEaYQrwgQoQPUdxIQWZxpFeZHbiV98FM0ZSaelS0,69
43
+ ingestr/src/google_ads/metrics.py,sha256=tAqpBpm-8l95oPT9cBxMWaEoDTNHVXnqUphYDHWKDiE,12099
44
+ ingestr/src/google_ads/predicates.py,sha256=K4wTuqfmJ9ko1RKeHTBDfQO_mUADVyuRqtywBPP-72w,683
45
+ ingestr/src/google_ads/reports.py,sha256=AVY1pPt5yaIFskQe1k5VW2Dhlux3bzewsHlDrdGEems,12686
40
46
  ingestr/src/google_analytics/__init__.py,sha256=8Evpmoy464YpNbCI_NmvFHIzWCu7J7SjJw-RrPZ6AL8,3674
41
47
  ingestr/src/google_analytics/helpers.py,sha256=vLmFyQ_IEJEK5LlxBJQeJw0VHaE5gRRZdBa54U72CaQ,5965
42
48
  ingestr/src/google_sheets/README.md,sha256=wFQhvmGpRA38Ba2N_WIax6duyD4c7c_pwvvprRfQDnw,5470
@@ -54,6 +60,9 @@ ingestr/src/kafka/helpers.py,sha256=V9WcVn3PKnEpggArHda4vnAcaV8VDuh__dSmRviJb5Y,
54
60
  ingestr/src/klaviyo/_init_.py,sha256=ucWHqBe8DQvXVpbmxKFAV5ljpCFb4ps_2QTD0OSiWxY,7905
55
61
  ingestr/src/klaviyo/client.py,sha256=tPj79ia7AW0ZOJhzlKNPCliGbdojRNwUFp8HvB2ym5s,7434
56
62
  ingestr/src/klaviyo/helpers.py,sha256=_i-SHffhv25feLDcjy6Blj1UxYLISCwVCMgGtrlnYHk,496
63
+ ingestr/src/linkedin_ads/__init__.py,sha256=CAPWFyV24loziiphbLmODxZUXZJwm4JxlFkr56q0jfo,1855
64
+ ingestr/src/linkedin_ads/dimension_time_enum.py,sha256=EmHRdkFyTAfo4chGjThrwqffWJxmAadZMbpTvf0xkQc,198
65
+ ingestr/src/linkedin_ads/helpers.py,sha256=6jSIp4DF0iUafJWU3Y7DbIJGKRH6hrx4S7zCTDOjNuE,4528
57
66
  ingestr/src/mongodb/__init__.py,sha256=aMr1PFIDUMRv--ne61lR17HudsN-fsrzMeyxe9PqK2s,4335
58
67
  ingestr/src/mongodb/helpers.py,sha256=y9rYKR8eyIqam_eNsZmwSYevgi8mghh7Zp8qhTHl65s,5652
59
68
  ingestr/src/notion/__init__.py,sha256=36wUui8finbc85ObkRMq8boMraXMUehdABN_AMe_hzA,1834
@@ -91,8 +100,8 @@ ingestr/testdata/delete_insert_part2.csv,sha256=B_KUzpzbNdDY_n7wWop1mT2cz36TmayS
91
100
  ingestr/testdata/merge_expected.csv,sha256=DReHqWGnQMsf2PBv_Q2pfjsgvikYFnf1zYcQZ7ZqYN0,276
92
101
  ingestr/testdata/merge_part1.csv,sha256=Pw8Z9IDKcNU0qQHx1z6BUf4rF_-SxKGFOvymCt4OY9I,185
93
102
  ingestr/testdata/merge_part2.csv,sha256=T_GiWxA81SN63_tMOIuemcvboEFeAmbKc7xRXvL9esw,287
94
- ingestr-0.12.9.dist-info/METADATA,sha256=p7RGcw0cnHPU93RLIPWOkMtj36Ax9BnA7bPSKIQ3pfg,8056
95
- ingestr-0.12.9.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
96
- ingestr-0.12.9.dist-info/entry_points.txt,sha256=oPJy0KBnPWYjDtP1k8qwAihcTLHSZokSQvRAw_wtfJM,46
97
- ingestr-0.12.9.dist-info/licenses/LICENSE.md,sha256=cW8wIhn8HFE-KLStDF9jHQ1O_ARWP3kTpk_-eOccL24,1075
98
- ingestr-0.12.9.dist-info/RECORD,,
103
+ ingestr-0.12.11.dist-info/METADATA,sha256=fxNa7pb3GLEvLuUjHSOviflBwIBJto0ck1PyQp893jU,8127
104
+ ingestr-0.12.11.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
105
+ ingestr-0.12.11.dist-info/entry_points.txt,sha256=oPJy0KBnPWYjDtP1k8qwAihcTLHSZokSQvRAw_wtfJM,46
106
+ ingestr-0.12.11.dist-info/licenses/LICENSE.md,sha256=cW8wIhn8HFE-KLStDF9jHQ1O_ARWP3kTpk_-eOccL24,1075
107
+ ingestr-0.12.11.dist-info/RECORD,,