ingestr 0.14.94__py3-none-any.whl → 0.14.96__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of ingestr might be problematic. Click here for more details.

ingestr/src/buildinfo.py CHANGED
@@ -1 +1 @@
1
- version = "v0.14.94"
1
+ version = "v0.14.96"
ingestr/src/factory.py CHANGED
@@ -84,6 +84,7 @@ from ingestr.src.sources import (
84
84
  ShopifySource,
85
85
  SlackSource,
86
86
  SmartsheetSource,
87
+ SocrataSource,
87
88
  SolidgateSource,
88
89
  SqlSource,
89
90
  StripeAnalyticsSource,
@@ -218,6 +219,7 @@ class SourceDestinationFactory:
218
219
  "sftp": SFTPSource,
219
220
  "pinterest": PinterestSource,
220
221
  "revenuecat": RevenueCatSource,
222
+ "socrata": SocrataSource,
221
223
  "zoom": ZoomSource,
222
224
  "clickup": ClickupSource,
223
225
  "influxdb": InfluxDBSource,
@@ -0,0 +1,83 @@
1
+ """A source loading data from Socrata open data platform"""
2
+
3
+ from typing import Any, Dict, Iterator, Optional
4
+
5
+ import dlt
6
+
7
+ from .helpers import fetch_data
8
+
9
+
10
@dlt.source(name="socrata", max_table_nesting=0)
def source(
    domain: str,
    dataset_id: str,
    app_token: Optional[str] = None,
    username: Optional[str] = None,
    password: Optional[str] = None,
    incremental: Optional[Any] = None,
    primary_key: Optional[str] = None,
    write_disposition: Optional[str] = dlt.config.value,
):
    """
    Build a dlt source exposing one Socrata dataset as the ``dataset`` resource.

    Args:
        domain: The Socrata domain (e.g., "evergreen.data.socrata.com")
        dataset_id: The dataset identifier (e.g., "6udu-fhnu")
        app_token: Socrata app token, forwarded to ``fetch_data``
        username: Username for authentication, forwarded to ``fetch_data``
        password: Password for authentication, forwarded to ``fetch_data``
        incremental: Optional dlt incremental object. When it has a cursor
            path, that path plus its last value and end value are forwarded
            to ``fetch_data`` as ``incremental_key``, ``start_value`` and
            ``end_value``.
        primary_key: Primary key hint passed to the resource. No default is
            applied here; ``None`` is passed through unchanged.
        write_disposition: Write disposition ("replace", "append", "merge").
            Falls back to "replace" when no value is provided.

    Returns:
        A one-element tuple containing the ``dataset`` resource.
    """
    # Local import: List is only needed for the resource's return annotation.
    from typing import List

    def _cursor_text(value: Any) -> str:
        # Date-like cursor values are sent in ISO 8601 form; anything else is
        # stringified. Used for both bounds so they are serialized the same way.
        return value.isoformat() if hasattr(value, "isoformat") else str(value)

    @dlt.resource(
        write_disposition=write_disposition or "replace",
        primary_key=primary_key,  # type: ignore[call-overload]
    )
    def dataset(
        incremental: Optional[dlt.sources.incremental] = incremental,  # type: ignore[type-arg]
    ) -> Iterator[List[Dict[str, Any]]]:
        """
        Yield the dataset page by page.

        Each yielded item is one page as returned by ``fetch_data`` (a list of
        record dicts). When an incremental with a cursor path is supplied, the
        cursor bounds are handed to ``fetch_data`` so it can filter the request.
        """
        fetch_kwargs: Dict[str, Any] = {
            "domain": domain,
            "dataset_id": dataset_id,
            "app_token": app_token,
            "username": username,
            "password": password,
        }

        if incremental and incremental.cursor_path:
            fetch_kwargs["incremental_key"] = incremental.cursor_path

            last_value = incremental.last_value
            fetch_kwargs["start_value"] = (
                _cursor_text(last_value) if last_value is not None else None
            )

            # end_value is read defensively; the key is only set when present.
            end_value = getattr(incremental, "end_value", None)
            if end_value is not None:
                fetch_kwargs["end_value"] = _cursor_text(end_value)

        # fetch_data yields one list of records per page; pass pages through.
        yield from fetch_data(**fetch_kwargs)

    return (dataset,)
@@ -0,0 +1,85 @@
1
+ """Socrata API helpers"""
2
+
3
from typing import Any, Dict, Iterator, List, Optional

from dlt.sources.helpers import requests

from .settings import DEFAULT_PAGE_SIZE, REQUEST_TIMEOUT
8
+
9
+
10
def _soql_literal(value: Any) -> str:
    """Render *value* as a single-quoted SoQL string literal."""
    # Spaces become "T" so "YYYY-MM-DD HH:MM:SS" turns into an ISO timestamp.
    text = str(value).replace(" ", "T")
    # Embedded single quotes are doubled so the value cannot end the literal.
    return "'" + text.replace("'", "''") + "'"


def fetch_data(
    domain: str,
    dataset_id: str,
    app_token: Optional[str] = None,
    username: Optional[str] = None,
    password: Optional[str] = None,
    incremental_key: Optional[str] = None,
    start_value: Optional[str] = None,
    end_value: Optional[str] = None,
) -> Iterator[List[Dict[str, Any]]]:
    """
    Fetch records from a Socrata dataset, one page at a time.

    Uses ``$limit``/``$offset`` pagination with a page size of
    ``DEFAULT_PAGE_SIZE`` and stops on the first empty or short page. An
    explicit ``$order`` is always sent so that successive pages are taken
    from a stable ordering.

    Args:
        domain: Socrata domain (e.g., "data.seattle.gov")
        dataset_id: Dataset identifier (e.g., "6udu-fhnu")
        app_token: Socrata app token, sent as the ``X-App-Token`` header
        username: Username for basic authentication
        password: Password for basic authentication; credentials are only
            sent when both username and password are given
        incremental_key: Field used for the ``$where`` filter and ``$order``.
            It is inserted into the query verbatim, so it must be a trusted
            field name.
        start_value: Minimum value for incremental_key (inclusive)
        end_value: Maximum value for incremental_key (exclusive)

    Yields:
        Lists of records (one list per page)

    Raises:
        requests.HTTPError: If API request fails
    """
    url = f"https://{domain}/resource/{dataset_id}.json"

    headers = {"Accept": "application/json"}
    if app_token:
        headers["X-App-Token"] = app_token

    auth = (username, password) if username and password else None

    # The filter and ordering are identical for every page, so build them once.
    query: Dict[str, Any] = {}
    if incremental_key:
        conditions = []
        if start_value:
            conditions.append(f"{incremental_key} >= {_soql_literal(start_value)}")
        if end_value:
            conditions.append(f"{incremental_key} < {_soql_literal(end_value)}")
        if conditions:
            query["$where"] = " AND ".join(conditions)
        # :id breaks ties between rows sharing the same cursor value, which
        # would otherwise be free to move across page boundaries.
        query["$order"] = f"{incremental_key} ASC, :id ASC"
    else:
        # Without an explicit order, offset paging may repeat or skip rows.
        query["$order"] = ":id"

    limit = DEFAULT_PAGE_SIZE
    offset = 0

    while True:
        response = requests.get(
            url,
            headers=headers,
            auth=auth,
            params={**query, "$limit": limit, "$offset": offset},
            timeout=REQUEST_TIMEOUT,
        )
        response.raise_for_status()

        page = response.json()
        if not page:
            break

        yield page

        # A short page means the dataset is exhausted.
        if len(page) < limit:
            break

        offset += limit
@@ -0,0 +1,8 @@
1
+ """Socrata API settings and constants"""
2
+
3
# Timeout, in seconds, applied to each request
REQUEST_TIMEOUT: int = 30

# Number of records requested per page.
# The Socrata API accepts at most 50000 records per request.
DEFAULT_PAGE_SIZE: int = 50_000
ingestr/src/sources.py CHANGED
@@ -4233,3 +4233,93 @@ class CouchbaseSource:
4233
4233
  table_instance.max_table_nesting = 1
4234
4234
 
4235
4235
  return table_instance
4236
+
4237
+
4238
class SocrataSource:
    """Source adapter that builds a dlt source for a Socrata dataset."""

    def handles_incrementality(self) -> bool:
        """Always returns ``False``."""
        return False

    def dlt_source(self, uri: str, table: str, **kwargs):
        """
        Build the dlt source for one Socrata dataset.

        URI format: socrata://domain?app_token=TOKEN
        The query string may also carry ``username`` and ``password``.

        Args:
            uri: Connection URI; the network location is used as the domain
            table: Dataset ID (e.g., "6udu-fhnu")
            **kwargs: Optional settings read by this method:
                - incremental_key: Field to use for incremental loading
                - interval_start: Initial value for the incremental cursor
                - interval_end: End value for the incremental cursor
                - primary_key: Primary key passed through to the source

        Returns:
            The Socrata source restricted to its "dataset" resource.

        Raises:
            ValueError: If the URI has no domain or the table is empty.
        """
        from urllib.parse import parse_qs, urlparse

        def as_text(value):
            # Date-like objects are rendered via isoformat(); others via str().
            if hasattr(value, "isoformat"):
                return value.isoformat()
            return str(value)

        parts = urlparse(uri)
        if not parts.netloc:
            raise ValueError(
                "Domain must be provided in the URI.\n"
                "Format: socrata://domain?app_token=TOKEN\n"
                "Example: socrata://evergreen.data.socrata.com?app_token=mytoken"
            )

        options = parse_qs(parts.query)

        if not table:
            raise ValueError(
                "Dataset ID must be provided as the table parameter.\n"
                "Example: --source-table 6udu-fhnu"
            )

        def first_option(name):
            # parse_qs maps each key to a list; take the first value or None.
            return options.get(name, [None])[0]

        cursor = None
        cursor_field = kwargs.get("incremental_key")
        if cursor_field:
            lower = kwargs.get("interval_start")
            upper = kwargs.get("interval_end")

            # Falsy bounds are passed through untouched.
            cursor = dlt_incremental(
                cursor_field,
                initial_value=as_text(lower) if lower else lower,
                end_value=as_text(upper) if upper else upper,
                range_end="open",
                range_start="closed",
            )

        from ingestr.src.socrata_source import source

        socrata = source(
            domain=parts.netloc,
            dataset_id=table,
            app_token=first_option("app_token"),
            username=first_option("username"),
            password=first_option("password"),
            incremental=cursor,
            primary_key=kwargs.get("primary_key"),
        )
        return socrata.with_resources("dataset")
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: ingestr
3
- Version: 0.14.94
3
+ Version: 0.14.96
4
4
  Summary: ingestr is a command-line application that ingests data from various sources and stores them in any database.
5
5
  Project-URL: Homepage, https://github.com/bruin-data/ingestr
6
6
  Project-URL: Issues, https://github.com/bruin-data/ingestr/issues
@@ -2,17 +2,17 @@ ingestr/conftest.py,sha256=OE2yxeTCosS9CUFVuqNypm-2ftYvVBeeq7egm3878cI,1981
2
2
  ingestr/main.py,sha256=qo0g3wCFl8a_1jUwXagX8L1Q8PKKQlTF7md9pfnzW0Y,27155
3
3
  ingestr/src/.gitignore,sha256=8cX1AZTSI0TcdZFGTmS_oyBjpfCzhOEt0DdAo2dFIY8,203
4
4
  ingestr/src/blob.py,sha256=UUWMjHUuoR9xP1XZQ6UANQmnMVyDx3d0X4-2FQC271I,2138
5
- ingestr/src/buildinfo.py,sha256=CcaXu7ayEOiHaFuTFCgGdFP5zqt7HZWvn7bvO_OyRhk,21
5
+ ingestr/src/buildinfo.py,sha256=-9qPR_WQg9aaTRg324DJAZs43V_FQHsRu9G9xDfXrjE,21
6
6
  ingestr/src/destinations.py,sha256=QtjE0AGs0WkPHaI2snWPHJ8HHi4lwXUBYLJPklz8Mvk,27772
7
7
  ingestr/src/errors.py,sha256=fhJ2BxOqOsBfOxuTDKfZblvawBrPG3x_1VikIxMZBRI,874
8
- ingestr/src/factory.py,sha256=WBc2y5N-9HH3WOTUSe7sDEftarScDAipka7CSW-A4L4,7829
8
+ ingestr/src/factory.py,sha256=iFOFbwifvQf7qOtSoNPS6RGvAhsRaX7HzbjouHmSvfs,7882
9
9
  ingestr/src/filters.py,sha256=0n0sNAVG_f-B_1r7lW5iNtw9z_G1bxWzPaiL1i6tnbU,1665
10
10
  ingestr/src/http_client.py,sha256=bxqsk6nJNXCo-79gW04B53DQO-yr25vaSsqP0AKtjx4,732
11
11
  ingestr/src/loader.py,sha256=9NaWAyfkXdqAZSS-N72Iwo36Lbx4PyqIfaaH1dNdkFs,1712
12
12
  ingestr/src/masking.py,sha256=VN0LdfvExhQ1bZMRylGtaBUIoH-vjuIUmRnYKwo3yiY,11358
13
13
  ingestr/src/partition.py,sha256=BrIP6wFJvyR7Nus_3ElnfxknUXeCipK_E_bB8kZowfc,969
14
14
  ingestr/src/resource.py,sha256=ZqmZxFQVGlF8rFPhBiUB08HES0yoTj8sZ--jKfaaVps,1164
15
- ingestr/src/sources.py,sha256=XEimy9ph8QDW8vjNocjovxM6cdrUkIPmFGZ9eLXZWVk,148842
15
+ ingestr/src/sources.py,sha256=JVZf22XgIFXov3-yKOjsbQVw9cV_LrDeXD6eb4Z6jFk,151802
16
16
  ingestr/src/table_definition.py,sha256=REbAbqdlmUMUuRh8nEQRreWjPVOQ5ZcfqGkScKdCrmk,390
17
17
  ingestr/src/time.py,sha256=H_Fk2J4ShXyUM-EMY7MqCLZQhlnZMZvO952bmZPc4yE,254
18
18
  ingestr/src/version.py,sha256=J_2xgZ0mKlvuHcjdKCx2nlioneLH0I47JiU_Slr_Nwc,189
@@ -154,6 +154,9 @@ ingestr/src/slack/__init__.py,sha256=pyDukxcilqTAe_bBzfWJ8Vxi83S-XEdEFBH2pEgILrM
154
154
  ingestr/src/slack/helpers.py,sha256=08TLK7vhFvH_uekdLVOLF3bTDe1zgH0QxHObXHzk1a8,6545
155
155
  ingestr/src/slack/settings.py,sha256=NhKn4y1zokEa5EmIZ05wtj_-I0GOASXZ5V81M1zXCtY,457
156
156
  ingestr/src/smartsheets/__init__.py,sha256=RIEfN1T2TMFg8T0RvN4o6sqC58YusJRDrmE9Isos5P4,2375
157
+ ingestr/src/socrata_source/__init__.py,sha256=K5DVpsVXTMfunZd5YoEsn1nipfo1zavFS59g3m2tsc8,2984
158
+ ingestr/src/socrata_source/helpers.py,sha256=KbVojFSmMLXb0ajh8bhqfZfxDHH7rQ3nyI8p2jxVifA,2500
159
+ ingestr/src/socrata_source/settings.py,sha256=DLfu-4HOa5nR7h9tbOySEa2ye3w_Z6TYZ9_zPqWaNQk,220
157
160
  ingestr/src/solidgate/__init__.py,sha256=Ts83j-JSnFsFuF4tDhVOfZKg7H0-bIpfn3kg1ZOR58A,8003
158
161
  ingestr/src/solidgate/helpers.py,sha256=mAsW_1hpD7ab3Y2vw8fxHi4yD3aT1geLdIYZ7ycyxBc,5690
159
162
  ingestr/src/sql_database/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -186,8 +189,8 @@ ingestr/testdata/merge_expected.csv,sha256=DReHqWGnQMsf2PBv_Q2pfjsgvikYFnf1zYcQZ
186
189
  ingestr/testdata/merge_part1.csv,sha256=Pw8Z9IDKcNU0qQHx1z6BUf4rF_-SxKGFOvymCt4OY9I,185
187
190
  ingestr/testdata/merge_part2.csv,sha256=T_GiWxA81SN63_tMOIuemcvboEFeAmbKc7xRXvL9esw,287
188
191
  ingestr/tests/unit/test_smartsheets.py,sha256=zf3DXT29Y4TH2lNPBFphdjlaelUUyPJcsW2UO68RzDs,4862
189
- ingestr-0.14.94.dist-info/METADATA,sha256=B9NePP8xKNVs7YZ6hZV6gDfTGBxdRwxs7aMd5xPL6As,15359
190
- ingestr-0.14.94.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
191
- ingestr-0.14.94.dist-info/entry_points.txt,sha256=oPJy0KBnPWYjDtP1k8qwAihcTLHSZokSQvRAw_wtfJM,46
192
- ingestr-0.14.94.dist-info/licenses/LICENSE.md,sha256=cW8wIhn8HFE-KLStDF9jHQ1O_ARWP3kTpk_-eOccL24,1075
193
- ingestr-0.14.94.dist-info/RECORD,,
192
+ ingestr-0.14.96.dist-info/METADATA,sha256=vnkdaQVPvlnpHq9UgecuzRSSb_IiKE6_gS1jLkYzGEY,15359
193
+ ingestr-0.14.96.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
194
+ ingestr-0.14.96.dist-info/entry_points.txt,sha256=oPJy0KBnPWYjDtP1k8qwAihcTLHSZokSQvRAw_wtfJM,46
195
+ ingestr-0.14.96.dist-info/licenses/LICENSE.md,sha256=cW8wIhn8HFE-KLStDF9jHQ1O_ARWP3kTpk_-eOccL24,1075
196
+ ingestr-0.14.96.dist-info/RECORD,,