ingestr 0.13.55__py3-none-any.whl → 0.13.56__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of ingestr might be problematic. See the package registry page for more details.

ingestr/src/blob.py CHANGED
@@ -6,6 +6,10 @@ BucketName: TypeAlias = str
6
6
  FileGlob: TypeAlias = str
7
7
 
8
8
 
9
+ class UnsupportedEndpointError(Exception):
10
+ pass
11
+
12
+
9
13
  def parse_uri(uri: ParseResult, table: str) -> Tuple[BucketName, FileGlob]:
10
14
  """
11
15
  parse the URI of a blob storage and
@@ -50,3 +54,23 @@ def parse_uri(uri: ParseResult, table: str) -> Tuple[BucketName, FileGlob]:
50
54
  return "", parts[0]
51
55
 
52
56
  return parts[0], parts[1]
57
+
58
+
59
+ def parse_endpoint(path: str) -> str:
60
+ """
61
+ Parse the endpoint kind from the URI.
62
+
63
+ kind is a file format. one of [csv, jsonl, parquet]
64
+ """
65
+ file_extension = path.split(".")[-1]
66
+ if file_extension == "gz":
67
+ file_extension = path.split(".")[-2]
68
+ if file_extension == "csv":
69
+ endpoint = "read_csv"
70
+ elif file_extension == "jsonl":
71
+ endpoint = "read_jsonl"
72
+ elif file_extension == "parquet":
73
+ endpoint = "read_parquet"
74
+ else:
75
+ raise UnsupportedEndpointError(f"Unsupported file format: {file_extension}")
76
+ return endpoint
ingestr/src/buildinfo.py CHANGED
@@ -1 +1 @@
1
- version = "v0.13.55"
1
+ version = "v0.13.56"
ingestr/src/factory.py CHANGED
@@ -64,6 +64,7 @@ from ingestr.src.sources import (
64
64
  SqlSource,
65
65
  StripeAnalyticsSource,
66
66
  TikTokSource,
67
+ TrustpilotSource,
67
68
  ZendeskSource,
68
69
  )
69
70
 
@@ -165,6 +166,7 @@ class SourceDestinationFactory:
165
166
  "pipedrive": PipedriveSource,
166
167
  "frankfurter": FrankfurterSource,
167
168
  "freshdesk": FreshdeskSource,
169
+ "trustpilot": TrustpilotSource,
168
170
  "phantombuster": PhantombusterSource,
169
171
  "elasticsearch": ElasticsearchSource,
170
172
  "attio": AttioSource,
ingestr/src/sources.py CHANGED
@@ -1362,17 +1362,25 @@ class S3Source:
1362
1362
  secret=secret_access_key[0],
1363
1363
  )
1364
1364
 
1365
- file_extension = path_to_file.split(".")[-1]
1366
- if file_extension == "csv":
1367
- endpoint = "read_csv"
1368
- elif file_extension == "jsonl":
1369
- endpoint = "read_jsonl"
1370
- elif file_extension == "parquet":
1371
- endpoint = "read_parquet"
1365
+ endpoint: Optional[str] = None
1366
+ if "#" in table:
1367
+ _, endpoint = table.split("#")
1368
+ if endpoint not in ["csv", "jsonl", "parquet"]:
1369
+ raise ValueError(
1370
+ "S3 Source only supports specific formats files: csv, jsonl, parquet"
1371
+ )
1372
+ endpoint = f"read_{endpoint}"
1372
1373
  else:
1373
- raise ValueError(
1374
- "S3 Source only supports specific formats files: csv, jsonl, parquet"
1375
- )
1374
+ try:
1375
+ endpoint = blob.parse_endpoint(path_to_file)
1376
+ except blob.UnsupportedEndpointError:
1377
+ raise ValueError(
1378
+ "S3 Source only supports specific formats files: csv, jsonl, parquet"
1379
+ )
1380
+ except Exception as e:
1381
+ raise ValueError(
1382
+ f"Failed to parse endpoint from path: {path_to_file}"
1383
+ ) from e
1376
1384
 
1377
1385
  from ingestr.src.filesystem import readers
1378
1386
 
@@ -1844,17 +1852,16 @@ class GCSSource:
1844
1852
  token=credentials,
1845
1853
  )
1846
1854
 
1847
- file_extension = path_to_file.split(".")[-1]
1848
- if file_extension == "csv":
1849
- endpoint = "read_csv"
1850
- elif file_extension == "jsonl":
1851
- endpoint = "read_jsonl"
1852
- elif file_extension == "parquet":
1853
- endpoint = "read_parquet"
1854
- else:
1855
+ try:
1856
+ endpoint = blob.parse_endpoint(path_to_file)
1857
+ except blob.UnsupportedEndpointError:
1855
1858
  raise ValueError(
1856
- "GCS Source only supports specific formats files: csv, jsonl, parquet"
1859
+ "S3 Source only supports specific formats files: csv, jsonl, parquet"
1857
1860
  )
1861
+ except Exception as e:
1862
+ raise ValueError(
1863
+ f"Failed to parse endpoint from path: {path_to_file}"
1864
+ ) from e
1858
1865
 
1859
1866
  from ingestr.src.filesystem import readers
1860
1867
 
@@ -2392,6 +2399,47 @@ class FreshdeskSource:
2392
2399
  ).with_resources(table)
2393
2400
 
2394
2401
 
2402
+ class TrustpilotSource:
2403
+ # trustpilot://<business_unit_id>?api_key=<api_key>
2404
+ def handles_incrementality(self) -> bool:
2405
+ return True
2406
+
2407
+ def dlt_source(self, uri: str, table: str, **kwargs):
2408
+ parsed_uri = urlparse(uri)
2409
+ business_unit_id = parsed_uri.netloc
2410
+ params = parse_qs(parsed_uri.query)
2411
+
2412
+ if not business_unit_id:
2413
+ raise MissingValueError("business_unit_id", "Trustpilot")
2414
+
2415
+ api_key = params.get("api_key")
2416
+ if api_key is None:
2417
+ raise MissingValueError("api_key", "Trustpilot")
2418
+
2419
+ start_date = kwargs.get("interval_start")
2420
+ if start_date is None:
2421
+ start_date = ensure_pendulum_datetime("2000-01-01").in_tz("UTC").isoformat()
2422
+ else:
2423
+ start_date = ensure_pendulum_datetime(start_date).in_tz("UTC").isoformat()
2424
+
2425
+ end_date = kwargs.get("interval_end")
2426
+
2427
+ if end_date is not None:
2428
+ end_date = ensure_pendulum_datetime(end_date).in_tz("UTC").isoformat()
2429
+
2430
+ if table not in ["reviews"]:
2431
+ raise UnsupportedResourceError(table, "Trustpilot")
2432
+
2433
+ from ingestr.src.trustpilot import trustpilot_source
2434
+
2435
+ return trustpilot_source(
2436
+ business_unit_id=business_unit_id,
2437
+ api_key=api_key[0],
2438
+ start_date=start_date,
2439
+ end_date=end_date,
2440
+ ).with_resources(table)
2441
+
2442
+
2395
2443
  class PhantombusterSource:
2396
2444
  def handles_incrementality(self) -> bool:
2397
2445
  return True
@@ -2622,18 +2670,15 @@ class SFTPSource:
2622
2670
  else:
2623
2671
  file_glob = f"/{table}"
2624
2672
 
2625
- file_extension = table.split(".")[-1].lower()
2626
- endpoint: str
2627
- if file_extension == "csv":
2628
- endpoint = "read_csv"
2629
- elif file_extension == "jsonl":
2630
- endpoint = "read_jsonl"
2631
- elif file_extension == "parquet":
2632
- endpoint = "read_parquet"
2633
- else:
2673
+ try:
2674
+ endpoint = blob.parse_endpoint(table)
2675
+ except blob.UnsupportedEndpointError:
2634
2676
  raise ValueError(
2635
- "FTPServer Source only supports specific file formats: csv, jsonl, parquet."
2677
+ "SFTP Source only supports specific formats files: csv, jsonl, parquet"
2636
2678
  )
2679
+ except Exception as e:
2680
+ raise ValueError(f"Failed to parse endpoint from path: {table}") from e
2681
+
2637
2682
  from ingestr.src.filesystem import readers
2638
2683
 
2639
2684
  dlt_source_resource = readers(bucket_url, fs, file_glob)
@@ -0,0 +1,48 @@
1
+ """Trustpilot source for ingesting reviews."""
2
+
3
+ from typing import Any, Dict, Generator, Iterable
4
+
5
+ import dlt
6
+ import pendulum
7
+ from dlt.sources import DltResource
8
+
9
+ from .client import TrustpilotClient
10
+
11
+
12
+ @dlt.source()
13
+ def trustpilot_source(
14
+ business_unit_id: str,
15
+ start_date: str,
16
+ end_date: str | None,
17
+ api_key: str,
18
+ per_page: int = 1000,
19
+ ) -> Iterable[DltResource]:
20
+ """Return resources for Trustpilot."""
21
+
22
+ client = TrustpilotClient(api_key=api_key)
23
+
24
+ @dlt.resource(name="reviews", write_disposition="merge", primary_key="id")
25
+ def reviews(
26
+ dateTime=(
27
+ dlt.sources.incremental(
28
+ "updated_at",
29
+ initial_value=start_date,
30
+ end_value=end_date,
31
+ range_start="closed",
32
+ range_end="closed",
33
+ )
34
+ ),
35
+ ) -> Generator[Dict[str, Any], None, None]:
36
+ if end_date is None:
37
+ end_dt = pendulum.now(tz="UTC").isoformat()
38
+ else:
39
+ end_dt = dateTime.end_value
40
+ start_dt = dateTime.last_value
41
+ yield from client.paginated_reviews(
42
+ business_unit_id=business_unit_id,
43
+ per_page=per_page,
44
+ updated_since=start_dt,
45
+ end_date=end_dt,
46
+ )
47
+
48
+ yield reviews
@@ -0,0 +1,48 @@
1
+ """Simple Trustpilot API client."""
2
+
3
+ from typing import Any, Dict, Iterable
4
+
5
+ import pendulum
6
+ from dlt.sources.helpers import requests
7
+
8
+
9
+ class TrustpilotClient:
10
+ """Client for the Trustpilot public API."""
11
+
12
+ def __init__(self, api_key: str) -> None:
13
+ self.api_key = api_key
14
+ self.base_url = "https://api.trustpilot.com/v1"
15
+
16
+ def _get(self, endpoint: str, params: Dict[str, Any]) -> Dict[str, Any]:
17
+ params = dict(params)
18
+ params["apikey"] = self.api_key
19
+ response = requests.get(f"{self.base_url}{endpoint}", params=params)
20
+ response.raise_for_status()
21
+ return response.json()
22
+
23
+ def paginated_reviews(
24
+ self,
25
+ business_unit_id: str,
26
+ updated_since: str,
27
+ end_date: str,
28
+ per_page: int = 1000,
29
+ ) -> Iterable[Dict[str, Any]]:
30
+ page = 1
31
+ while True:
32
+ params: Dict[str, Any] = {"perPage": per_page, "page": page}
33
+ if updated_since:
34
+ params["updatedSince"] = updated_since
35
+ data = self._get(f"/business-units/{business_unit_id}/reviews", params)
36
+ reviews = data.get("reviews", data)
37
+ if not reviews:
38
+ break
39
+ for review in reviews:
40
+ end_date_dt = pendulum.parse(end_date)
41
+ review["updated_at"] = review["updatedAt"]
42
+ review_dt = pendulum.parse(review["updated_at"])
43
+ if review_dt > end_date_dt: # type: ignore
44
+ continue
45
+ yield review
46
+ if len(reviews) < per_page:
47
+ break
48
+ page += 1
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: ingestr
3
- Version: 0.13.55
3
+ Version: 0.13.56
4
4
  Summary: ingestr is a command-line application that ingests data from various sources and stores them in any database.
5
5
  Project-URL: Homepage, https://github.com/bruin-data/ingestr
6
6
  Project-URL: Issues, https://github.com/bruin-data/ingestr/issues
@@ -1,17 +1,17 @@
1
1
  ingestr/conftest.py,sha256=Q03FIJIZpLBbpj55cfCHIKEjc1FCvWJhMF2cidUJKQU,1748
2
2
  ingestr/main.py,sha256=GkC1hdq8AVGrvolc95zMfjmibI95p2pmFkbgCOVf-Og,26311
3
3
  ingestr/src/.gitignore,sha256=8cX1AZTSI0TcdZFGTmS_oyBjpfCzhOEt0DdAo2dFIY8,203
4
- ingestr/src/blob.py,sha256=onMe5ZHxPXTdcB_s2oGNdMo-XQJ3ajwOsWE9eSTGFmc,1495
5
- ingestr/src/buildinfo.py,sha256=bdi0-mZhnHheYgs6WuEb8p-RIk_RFAXRCF9HalRfV0k,21
4
+ ingestr/src/blob.py,sha256=UUWMjHUuoR9xP1XZQ6UANQmnMVyDx3d0X4-2FQC271I,2138
5
+ ingestr/src/buildinfo.py,sha256=xHWz596_bblLkASY5eAURBFkKuYtb-7IoI3_4X9OIZM,21
6
6
  ingestr/src/destinations.py,sha256=TcxM2rcwHfgMMP6U0yRNcfWKlEzkBbZbqCIDww7lkTY,16882
7
7
  ingestr/src/errors.py,sha256=Ufs4_DfE77_E3vnA1fOQdi6cmuLVNm7_SbFLkL1XPGk,686
8
- ingestr/src/factory.py,sha256=mcjgbmrZr6TvP9fCMQxo-aMGcrb2PqToRcSLp5nldww,6138
8
+ ingestr/src/factory.py,sha256=R7KzGRQ9tYZ_N-daD9OtnEp0K-DrsP8bUyXWdv4LV4A,6200
9
9
  ingestr/src/filters.py,sha256=LLecXe9QkLFkFLUZ92OXNdcANr1a8edDxrflc2ko_KA,1452
10
10
  ingestr/src/http_client.py,sha256=bxqsk6nJNXCo-79gW04B53DQO-yr25vaSsqP0AKtjx4,732
11
11
  ingestr/src/loader.py,sha256=9NaWAyfkXdqAZSS-N72Iwo36Lbx4PyqIfaaH1dNdkFs,1712
12
12
  ingestr/src/partition.py,sha256=BrIP6wFJvyR7Nus_3ElnfxknUXeCipK_E_bB8kZowfc,969
13
13
  ingestr/src/resource.py,sha256=ZqmZxFQVGlF8rFPhBiUB08HES0yoTj8sZ--jKfaaVps,1164
14
- ingestr/src/sources.py,sha256=3ozLt9lhhNANspfjA2vb8u6qjgBJezH8QBV1XKqT1fg,94124
14
+ ingestr/src/sources.py,sha256=_1iodwR8UC0MtlnJr6y45eMWCcUwXKXSqJMzYsBizXo,95759
15
15
  ingestr/src/table_definition.py,sha256=REbAbqdlmUMUuRh8nEQRreWjPVOQ5ZcfqGkScKdCrmk,390
16
16
  ingestr/src/time.py,sha256=H_Fk2J4ShXyUM-EMY7MqCLZQhlnZMZvO952bmZPc4yE,254
17
17
  ingestr/src/version.py,sha256=J_2xgZ0mKlvuHcjdKCx2nlioneLH0I47JiU_Slr_Nwc,189
@@ -124,6 +124,8 @@ ingestr/src/telemetry/event.py,sha256=W7bs4uVfPakQ5otmiqgqu1l5SqjYx1p87wudnWXckB
124
124
  ingestr/src/testdata/fakebqcredentials.json,sha256=scc6TUc963KAbKTLZCfcmqVzbtzDCW1_8JNRnyAXyy8,628
125
125
  ingestr/src/tiktok_ads/__init__.py,sha256=aEqCl3dTH6_d43s1jgAeG1UasEls_SlorORulYMwIL8,4590
126
126
  ingestr/src/tiktok_ads/tiktok_helpers.py,sha256=jmWHvZzN1Vt_PWrJkgq5a2wIwon-OBEzXoZx0jEy-74,3905
127
+ ingestr/src/trustpilot/__init__.py,sha256=ofhjep4qRPIi8q41qc97QVex8UbWF-Fd7gUsqeQlQX8,1279
128
+ ingestr/src/trustpilot/client.py,sha256=zKYt5C7nrR83Id0KN49EPmtml8MEtlSPlAosEFU3VXY,1616
127
129
  ingestr/src/zendesk/__init__.py,sha256=tmJ_jdb6kpwmEKpcv6Im71-bOZI6h-Tcofe18OH4I24,17762
128
130
  ingestr/src/zendesk/settings.py,sha256=Vdj706nTJFQ-3KH4nO97iYCQuba3dV3E9gfnmLK6xwU,2294
129
131
  ingestr/src/zendesk/helpers/__init__.py,sha256=YTJejCiUjfIcsj9FrkY0l-JGYDI7RRte1Ydq5FDH_0c,888
@@ -139,8 +141,8 @@ ingestr/testdata/merge_expected.csv,sha256=DReHqWGnQMsf2PBv_Q2pfjsgvikYFnf1zYcQZ
139
141
  ingestr/testdata/merge_part1.csv,sha256=Pw8Z9IDKcNU0qQHx1z6BUf4rF_-SxKGFOvymCt4OY9I,185
140
142
  ingestr/testdata/merge_part2.csv,sha256=T_GiWxA81SN63_tMOIuemcvboEFeAmbKc7xRXvL9esw,287
141
143
  ingestr/tests/unit/test_smartsheets.py,sha256=eiC2CCO4iNJcuN36ONvqmEDryCA1bA1REpayHpu42lk,5058
142
- ingestr-0.13.55.dist-info/METADATA,sha256=WNMM4qLCTDJg4xUnYNefHffB6vidRl4xopoBaaux-FM,15131
143
- ingestr-0.13.55.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
144
- ingestr-0.13.55.dist-info/entry_points.txt,sha256=oPJy0KBnPWYjDtP1k8qwAihcTLHSZokSQvRAw_wtfJM,46
145
- ingestr-0.13.55.dist-info/licenses/LICENSE.md,sha256=cW8wIhn8HFE-KLStDF9jHQ1O_ARWP3kTpk_-eOccL24,1075
146
- ingestr-0.13.55.dist-info/RECORD,,
144
+ ingestr-0.13.56.dist-info/METADATA,sha256=YleGPh8oMkcEXHKFXIHIgSyUeu9p53rkynuyC4uiKMw,15131
145
+ ingestr-0.13.56.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
146
+ ingestr-0.13.56.dist-info/entry_points.txt,sha256=oPJy0KBnPWYjDtP1k8qwAihcTLHSZokSQvRAw_wtfJM,46
147
+ ingestr-0.13.56.dist-info/licenses/LICENSE.md,sha256=cW8wIhn8HFE-KLStDF9jHQ1O_ARWP3kTpk_-eOccL24,1075
148
+ ingestr-0.13.56.dist-info/RECORD,,