ingestr 0.14.93__py3-none-any.whl → 0.14.96__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.
ingestr/src/buildinfo.py CHANGED
@@ -1 +1 @@
- version = "v0.14.93"
+ version = "v0.14.96"
ingestr/src/couchbase_source/__init__.py ADDED
@@ -0,0 +1,118 @@
+ """Source that loads data from Couchbase buckets, supports incremental loads."""
+
+ from typing import Optional
+
+ import dlt
+ from dlt.sources import DltResource
+
+ from .helpers import (
+     CouchbaseConfiguration,
+     client_from_credentials,
+     fetch_documents,
+ )
+
+
+ @dlt.source(max_table_nesting=0)
+ def couchbase_source(
+     connection_string: str = dlt.secrets.value,
+     username: str = dlt.secrets.value,
+     password: str = dlt.secrets.value,
+     bucket: str = dlt.config.value,
+     scope: Optional[str] = dlt.config.value,
+     collection: Optional[str] = dlt.config.value,
+     incremental: Optional[dlt.sources.incremental] = None,  # type: ignore[type-arg]
+     write_disposition: Optional[str] = dlt.config.value,
+     limit: Optional[int] = None,
+ ) -> DltResource:
+     """
+     A DLT source which loads data from a Couchbase bucket using Couchbase Python SDK.
+
+     Args:
+         connection_string (str): Couchbase connection string (e.g., 'couchbase://localhost')
+         username (str): Couchbase username
+         password (str): Couchbase password
+         bucket (str): Bucket name to load data from
+         scope (Optional[str]): Scope name (defaults to '_default')
+         collection (Optional[str]): Collection name (defaults to '_default')
+         incremental (Optional[dlt.sources.incremental]): Option to enable incremental loading.
+             E.g., `incremental=dlt.sources.incremental('updated_at', pendulum.parse('2022-01-01T00:00:00Z'))`
+         write_disposition (str): Write disposition of the resource.
+         limit (Optional[int]): The maximum number of documents to load.
+
+     Returns:
+         DltResource: A DLT resource for the Couchbase collection.
+     """
+     # Set up Couchbase client
+     cluster = client_from_credentials(connection_string, username, password)
+
+     resource_name = f"{bucket}_{scope}_{collection}"
+
+     return dlt.resource(  # type: ignore[call-overload, arg-type]
+         fetch_documents,
+         name=resource_name,
+         primary_key="id",
+         write_disposition=write_disposition or "replace",
+         spec=CouchbaseConfiguration,
+         max_table_nesting=0,
+     )(
+         cluster=cluster,
+         bucket_name=bucket,
+         scope_name=scope,
+         collection_name=collection,
+         incremental=incremental,
+         limit=limit,
+     )
+
+
+ @dlt.resource(
+     name=lambda args: f"{args['bucket']}_{args['scope']}_{args['collection']}",
+     standalone=True,
+     spec=CouchbaseConfiguration,  # type: ignore[arg-type]
+ )
+ def couchbase_collection(
+     connection_string: str = dlt.secrets.value,
+     username: str = dlt.secrets.value,
+     password: str = dlt.secrets.value,
+     bucket: str = dlt.config.value,
+     scope: Optional[str] = dlt.config.value,
+     collection: Optional[str] = dlt.config.value,
+     incremental: Optional[dlt.sources.incremental] = None,  # type: ignore[type-arg]
+     write_disposition: Optional[str] = dlt.config.value,
+     limit: Optional[int] = None,
+     chunk_size: Optional[int] = 1000,
+ ) -> DltResource:
+     """
+     A DLT resource which loads a collection from Couchbase.
+
+     Args:
+         connection_string (str): Couchbase connection string (e.g., 'couchbase://localhost')
+         username (str): Couchbase username
+         password (str): Couchbase password
+         bucket (str): Bucket name to load data from
+         scope (Optional[str]): Scope name (defaults to '_default')
+         collection (Optional[str]): Collection name (defaults to '_default')
+         incremental (Optional[dlt.sources.incremental]): Option to enable incremental loading.
+         write_disposition (str): Write disposition of the resource.
+         limit (Optional[int]): The maximum number of documents to load.
+         chunk_size (Optional[int]): The number of documents to load in each batch.
+
+     Returns:
+         DltResource: A DLT resource for the Couchbase collection.
+     """
+     # Set up Couchbase client
+     cluster = client_from_credentials(connection_string, username, password)
+
+     return dlt.resource(  # type: ignore[call-overload]
+         fetch_documents,
+         name=f"{bucket}_{scope}_{collection}",
+         primary_key="id",
+         write_disposition=write_disposition or "replace",
+     )(
+         cluster=cluster,
+         bucket_name=bucket,
+         scope_name=scope,
+         collection_name=collection,
+         incremental=incremental,
+         limit=limit,
+         chunk_size=chunk_size,
+     )
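
For orientation, a minimal sketch of wiring this new source into a dlt pipeline. The duckdb destination, credentials, and bucket/scope/collection names are illustrative assumptions, not part of this release:

import dlt

from ingestr.src.couchbase_source import couchbase_source

# Hypothetical local cluster and placeholder credentials, for illustration only.
pipeline = dlt.pipeline(destination="duckdb", dataset_name="couchbase_data")
source = couchbase_source(
    connection_string="couchbase://localhost",
    username="admin",
    password="password123",
    bucket="mybucket",
    scope="_default",
    collection="_default",
)
print(pipeline.run(source))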
ingestr/src/couchbase_source/helpers.py ADDED
@@ -0,0 +1,135 @@
+ """Helper functions for Couchbase source."""
+
+ from datetime import datetime, timedelta
+ from typing import Any, Dict, Iterator, Optional
+
+ import dlt
+ from couchbase.auth import PasswordAuthenticator  # type: ignore[import-untyped]
+ from couchbase.cluster import Cluster  # type: ignore[import-untyped]
+ from couchbase.options import (  # type: ignore[import-untyped]
+     ClusterOptions,
+     QueryOptions,
+ )
+ from dlt.common.configuration import configspec
+ from dlt.common.time import ensure_pendulum_datetime
+
+
+ @configspec
+ class CouchbaseConfiguration:
+     """Configuration for Couchbase source."""
+
+     connection_string: str = dlt.secrets.value
+     username: str = dlt.secrets.value
+     password: str = dlt.secrets.value
+     bucket: str = dlt.config.value
+     scope: Optional[str] = dlt.config.value
+     collection: Optional[str] = dlt.config.value
+
+
+ def client_from_credentials(
+     connection_string: str, username: str, password: str
+ ) -> Cluster:
+     """
+     Create a Couchbase cluster client from credentials.
+
+     Args:
+         connection_string: Couchbase connection string
+             - Local/self-hosted: 'couchbase://localhost'
+             - Capella (cloud): 'couchbases://your-instance.cloud.couchbase.com'
+         username: Couchbase username
+         password: Couchbase password
+
+     Returns:
+         Cluster: Connected Couchbase cluster instance
+     """
+     auth = PasswordAuthenticator(username, password)
+     options = ClusterOptions(auth)
+
+     # Apply wan_development profile for Capella (couchbases://) connections
+     # This helps avoid latency issues when accessing from different networks
+     if connection_string.startswith("couchbases://"):
+         options.apply_profile("wan_development")
+
+     cluster = Cluster(connection_string, options)
+     cluster.wait_until_ready(timedelta(seconds=30))
+
+     return cluster
+
+
+ def fetch_documents(
+     cluster: Cluster,
+     bucket_name: str,
+     scope_name: str,
+     collection_name: str,
+     incremental: Optional[dlt.sources.incremental] = None,  # type: ignore[type-arg]
+     limit: Optional[int] = None,
+     chunk_size: Optional[int] = 1000,
+ ) -> Iterator[Dict[str, Any]]:
+     """
+     Fetch documents from a Couchbase collection using N1QL queries.
+
+     Args:
+         cluster: Couchbase cluster instance
+         bucket_name: Name of the bucket
+         scope_name: Name of the scope
+         collection_name: Name of the collection
+         incremental: Incremental loading configuration
+         limit: Maximum number of documents to fetch
+         chunk_size: Number of documents to fetch per batch
+
+     Yields:
+         Dict[str, Any]: Document data
+     """
+     # Build N1QL query with full path
+     full_collection_path = f"`{bucket_name}`.`{scope_name}`.`{collection_name}`"
+     n1ql_query = f"SELECT META().id as id, c.* FROM {full_collection_path} c"
+
+     # Add incremental filter if provided
+     if incremental and incremental.cursor_path:
+         where_clause = f" WHERE {incremental.cursor_path} >= $start_value"
+         if incremental.end_value is not None:
+             where_clause += f" AND {incremental.cursor_path} < $end_value"
+         n1ql_query += where_clause
+
+     # Add limit if provided
+     if limit:
+         n1ql_query += f" LIMIT {limit}"
+
+     # Execute query
+     try:
+         query_options = QueryOptions()
+
+         # Add parameters if incremental
+         if incremental and incremental.cursor_path:
+             named_parameters = {"start_value": incremental.last_value}
+             if incremental.end_value is not None:
+                 named_parameters["end_value"] = incremental.end_value
+             query_options = QueryOptions(named_parameters=named_parameters)
+
+         result = cluster.query(n1ql_query, query_options)
+
+         # Yield documents
+         count = 0
+         for row in result:
+             doc = dict(row)
+
+             # Convert datetime fields to proper format
+             if (
+                 incremental
+                 and incremental.cursor_path
+                 and incremental.cursor_path in doc
+             ):
+                 cursor_value = doc[incremental.cursor_path]
+                 if isinstance(cursor_value, (str, datetime)):
+                     doc[incremental.cursor_path] = ensure_pendulum_datetime(
+                         cursor_value
+                     )
+
+             yield doc
+
+             count += 1
+             if limit and count >= limit:
+                 break
+
+     except Exception as e:
+         raise Exception(f"Error executing Couchbase N1QL query: {str(e)}")
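
To make the query construction in fetch_documents concrete, a sketch of the statement it builds for an incremental run with a limit; the bucket and cursor-field names are placeholders:

# Mirrors the string assembly in fetch_documents (names are placeholders).
bucket_name, scope_name, collection_name = "mybucket", "_default", "_default"
cursor_path = "updated_at"
n1ql_query = (
    f"SELECT META().id as id, c.* "
    f"FROM `{bucket_name}`.`{scope_name}`.`{collection_name}` c"
    f" WHERE {cursor_path} >= $start_value AND {cursor_path} < $end_value"
    f" LIMIT 100"
)
print(n1ql_query)
# $start_value / $end_value are bound via QueryOptions(named_parameters=...),
# so cursor values are never interpolated into the query string itself.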
ingestr/src/factory.py CHANGED
@@ -39,6 +39,7 @@ from ingestr.src.sources import (
      AttioSource,
      ChessSource,
      ClickupSource,
+     CouchbaseSource,
      DoceboSource,
      DynamoDBSource,
      ElasticsearchSource,
@@ -83,6 +84,7 @@ from ingestr.src.sources import (
      ShopifySource,
      SlackSource,
      SmartsheetSource,
+     SocrataSource,
      SolidgateSource,
      SqlSource,
      StripeAnalyticsSource,
@@ -160,6 +162,7 @@ class SourceDestinationFactory:
          "allium": AlliumSource,
          "anthropic": AnthropicSource,
          "csv": LocalCsvSource,
+         "couchbase": CouchbaseSource,
          "docebo": DoceboSource,
          "http": HttpSource,
          "https": HttpSource,
@@ -216,6 +219,7 @@ class SourceDestinationFactory:
          "sftp": SFTPSource,
          "pinterest": PinterestSource,
          "revenuecat": RevenueCatSource,
+         "socrata": SocrataSource,
          "zoom": ZoomSource,
          "clickup": ClickupSource,
          "influxdb": InfluxDBSource,
@@ -1,4 +1,3 @@
1
- import asyncio
2
1
  from typing import Any, Dict, Iterable, Iterator
3
2
 
4
3
  import aiohttp
@@ -40,51 +39,26 @@ def revenuecat_source(
40
39
  yield project
41
40
 
42
41
  @dlt.resource(
43
- name="customers", primary_key="id", write_disposition="merge", parallelized=True
42
+ name="customer_ids",
43
+ write_disposition="replace",
44
+ selected=False,
45
+ parallelized=True,
44
46
  )
45
- def customers() -> Iterator[Dict[str, Any]]:
46
- """Get list of customers with nested purchases and subscriptions."""
47
+ def customer_ids():
47
48
  if project_id is None:
48
49
  raise ValueError("project_id is required for customers resource")
49
- endpoint = f"/projects/{project_id}/customers"
50
50
 
51
- async def process_customer_batch(customer_batch):
52
- """Process a batch of customers with async operations."""
53
- async with aiohttp.ClientSession() as session:
54
- tasks = []
55
- for customer in customer_batch:
56
- task = process_customer_with_nested_resources_async(
57
- session, api_key, project_id, customer
58
- )
59
- tasks.append(task)
51
+ yield _paginate(api_key, f"/projects/{project_id}/customers")
60
52
 
61
- return await asyncio.gather(*tasks)
62
-
63
- def process_customers_sync():
64
- """Process customers in batches using asyncio."""
65
- batch_size = 50 # Conservative batch size due to 60 req/min rate limit
66
- current_batch = []
67
-
68
- for customer in _paginate(api_key, endpoint):
69
- current_batch.append(customer)
70
-
71
- if len(current_batch) >= batch_size:
72
- # Process the batch asynchronously
73
- processed_customers = asyncio.run(
74
- process_customer_batch(current_batch)
75
- )
76
- for processed_customer in processed_customers:
77
- yield processed_customer
78
- current_batch = []
79
-
80
- # Process any remaining customers in the final batch
81
- if current_batch:
82
- processed_customers = asyncio.run(process_customer_batch(current_batch))
83
- for processed_customer in processed_customers:
84
- yield processed_customer
85
-
86
- # Yield each processed customer
87
- yield from process_customers_sync()
53
+ @dlt.transformer(
54
+ data_from=customer_ids, write_disposition="replace", parallelized=True
55
+ )
56
+ async def customers(customers) -> Iterator[Dict[str, Any]]:
57
+ async with aiohttp.ClientSession() as session:
58
+ for customer in customers:
59
+ yield await process_customer_with_nested_resources_async(
60
+ session, api_key, project_id, customer
61
+ )
88
62
 
89
63
  # Create project-dependent resources dynamically
90
64
  project_resources = []
@@ -103,6 +77,7 @@ def revenuecat_source(
103
77
 
104
78
  return [
105
79
  projects,
80
+ customer_ids,
106
81
  customers,
107
82
  *project_resources,
108
83
  ]
ingestr/src/revenuecat/helpers.py CHANGED
@@ -64,12 +64,9 @@ def _paginate(
      while True:
          data = _make_request(api_key, endpoint, current_params)

-         # Yield items from the current page
          if "items" in data and data["items"] is not None:
-             for item in data["items"]:
-                 yield item
+             yield data["items"]

-         # Check if there's a next page
          if "next_page" not in data:
              break

@@ -88,7 +85,6 @@ def convert_timestamps_to_iso(
      """Convert timestamp fields from milliseconds to ISO format."""
      for field in timestamp_fields:
          if field in record and record[field] is not None:
-             # Convert from milliseconds timestamp to ISO datetime string
              timestamp_ms = record[field]
              dt = pendulum.from_timestamp(timestamp_ms / 1000)
              record[field] = dt.to_iso8601_string()
@@ -177,87 +173,37 @@ async def _paginate_async(
      return items


- async def fetch_and_process_nested_resource_async(
-     session: aiohttp.ClientSession,
-     api_key: str,
-     project_id: str,
-     customer_id: str,
-     customer: Dict[str, Any],
-     resource_name: str,
-     timestamp_fields: Optional[List[str]] = None,
- ) -> None:
-     """
-     Fetch and process any nested resource for a customer asynchronously.
-
-     Args:
-         session: aiohttp ClientSession
-         api_key: RevenueCat API key
-         project_id: Project ID
-         customer_id: Customer ID
-         customer: Customer data dictionary to modify
-         resource_name: Name of the nested resource (e.g., 'purchases', 'subscriptions', 'events')
-         timestamp_fields: List of timestamp fields to convert to ISO format
-     """
-     # If resource not included in customer data, fetch separately
-     if resource_name not in customer or customer[resource_name] is None:
-         endpoint = f"/projects/{project_id}/customers/{customer_id}/{resource_name}"
-         customer[resource_name] = await _paginate_async(session, api_key, endpoint)
-
-     # Convert timestamps if fields specified
-     if (
-         timestamp_fields
-         and resource_name in customer
-         and customer[resource_name] is not None
-     ):
-         for item in customer[resource_name]:
-             convert_timestamps_to_iso(item, timestamp_fields)
-
-
  async def process_customer_with_nested_resources_async(
      session: aiohttp.ClientSession,
      api_key: str,
      project_id: str,
      customer: Dict[str, Any],
  ) -> Dict[str, Any]:
-     """
-     Process a customer and fetch nested resources concurrently.
-
-     Args:
-         session: aiohttp ClientSession
-         api_key: RevenueCat API key
-         project_id: Project ID
-         customer: Customer data to process
-
-     Returns:
-         Customer data with nested resources populated
-     """
      customer_id = customer["id"]
-
-     # Convert customer timestamps
      customer = convert_timestamps_to_iso(customer, ["first_seen_at", "last_seen_at"])
-
-     # Define nested resources to fetch concurrently
      nested_resources = [
          ("subscriptions", ["purchased_at", "expires_at", "grace_period_expires_at"]),
          ("purchases", ["purchased_at", "expires_at"]),
      ]

-     # Create concurrent tasks for fetching nested resources
-     tasks = []
-     for resource_name, timestamp_fields in nested_resources:
-         task = fetch_and_process_nested_resource_async(
-             session,
-             api_key,
-             project_id,
-             customer_id,
-             customer,
-             resource_name,
-             timestamp_fields,
-         )
-         tasks.append(task)
-
-     # Wait for all nested resources to be fetched
-     await asyncio.gather(*tasks)
+     async def fetch_and_convert(resource_name, timestamp_fields):
+         if resource_name not in customer or customer[resource_name] is None:
+             endpoint = f"/projects/{project_id}/customers/{customer_id}/{resource_name}"
+             customer[resource_name] = await _paginate_async(session, api_key, endpoint)
+         if (
+             timestamp_fields
+             and resource_name in customer
+             and customer[resource_name] is not None
+         ):
+             for item in customer[resource_name]:
+                 convert_timestamps_to_iso(item, timestamp_fields)
+
+     await asyncio.gather(
+         *[
+             fetch_and_convert(resource_name, timestamp_fields)
+             for resource_name, timestamp_fields in nested_resources
+         ]
+     )

      return customer

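The net effect of the revenuecat changes: _paginate now yields whole pages, customer_ids exposes those pages as an unselected resource, and a parallelized async transformer enriches each page concurrently. A minimal sketch of the same resource-plus-transformer pattern in isolation; the names, data, and duckdb destination are generic stand-ins, not the RevenueCat API:

import dlt

@dlt.resource(name="page_ids", selected=False, parallelized=True)
def page_ids():
    # Yield whole pages so each page becomes one unit of parallel work.
    for page in ([1, 2], [3, 4]):
        yield page

@dlt.transformer(data_from=page_ids, parallelized=True)
async def enriched(items):
    # Each yielded page arrives here as one item and is enriched concurrently.
    for item in items:
        yield {"id": item, "value": item * 10}

pipeline = dlt.pipeline(destination="duckdb", dataset_name="example")
print(pipeline.run(enriched))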
ingestr/src/socrata_source/__init__.py ADDED
@@ -0,0 +1,83 @@
+ """A source loading data from Socrata open data platform"""
+
+ from typing import Any, Dict, Iterator, Optional
+
+ import dlt
+
+ from .helpers import fetch_data
+
+
+ @dlt.source(name="socrata", max_table_nesting=0)
+ def source(
+     domain: str,
+     dataset_id: str,
+     app_token: Optional[str] = None,
+     username: Optional[str] = None,
+     password: Optional[str] = None,
+     incremental: Optional[Any] = None,
+     primary_key: Optional[str] = None,
+     write_disposition: Optional[str] = dlt.config.value,
+ ):
+     """
+     A dlt source for the Socrata open data platform.
+
+     Supports both full refresh (replace) and incremental loading (merge).
+
+     Args:
+         domain: The Socrata domain (e.g., "evergreen.data.socrata.com")
+         dataset_id: The dataset identifier (e.g., "6udu-fhnu")
+         app_token: Socrata app token for higher rate limits (recommended)
+         username: Username for authentication (if dataset is private)
+         password: Password for authentication (if dataset is private)
+         incremental: DLT incremental object for incremental loading
+         primary_key: Primary key field for merge operations (default: ":id")
+         write_disposition: Write disposition ("replace", "append", "merge").
+             If not provided, automatically determined based on incremental setting.
+
+     Returns:
+         A dlt source with a single "dataset" resource
+     """
+
+     @dlt.resource(
+         write_disposition=write_disposition or "replace",
+         primary_key=primary_key,  # type: ignore[call-overload]
+     )
+     def dataset(
+         incremental: Optional[dlt.sources.incremental] = incremental,  # type: ignore[type-arg]
+     ) -> Iterator[Dict[str, Any]]:
+         """
+         Yields records from a Socrata dataset.
+
+         Supports both full refresh (replace) and incremental loading (merge).
+         When incremental is provided, filters data using SoQL WHERE clause on the server side.
+
+         Yields:
+             Dict[str, Any]: Individual records from the dataset
+         """
+         fetch_kwargs: Dict[str, Any] = {
+             "domain": domain,
+             "dataset_id": dataset_id,
+             "app_token": app_token,
+             "username": username,
+             "password": password,
+         }
+
+         if incremental and incremental.cursor_path:
+             fetch_kwargs["incremental_key"] = incremental.cursor_path
+             fetch_kwargs["start_value"] = (
+                 str(incremental.last_value)
+                 if incremental.last_value is not None
+                 else None
+             )
+             if getattr(incremental, "end_value", None) is not None:
+                 ev = incremental.end_value  # type: ignore[attr-defined]
+                 fetch_kwargs["end_value"] = (
+                     ev.isoformat()  # type: ignore[union-attr]
+                     if hasattr(ev, "isoformat")
+                     else str(ev)
+                 )
+
+         # Fetch and yield records
+         yield from fetch_data(**fetch_kwargs)
+
+     return (dataset,)
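
A minimal sketch of running the new source on its own; the domain and dataset id reuse the docstring examples above, and the duckdb destination is an assumption:

import dlt

from ingestr.src.socrata_source import source as socrata_source

pipeline = dlt.pipeline(destination="duckdb", dataset_name="socrata_data")
info = pipeline.run(
    socrata_source(domain="evergreen.data.socrata.com", dataset_id="6udu-fhnu")
)
print(info)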
ingestr/src/socrata_source/helpers.py ADDED
@@ -0,0 +1,85 @@
+ """Socrata API helpers"""
+
+ from typing import Any, Dict, Iterator, Optional
+
+ from dlt.sources.helpers import requests
+
+ from .settings import DEFAULT_PAGE_SIZE, REQUEST_TIMEOUT
+
+
+ def fetch_data(
+     domain: str,
+     dataset_id: str,
+     app_token: Optional[str] = None,
+     username: Optional[str] = None,
+     password: Optional[str] = None,
+     incremental_key: Optional[str] = None,
+     start_value: Optional[str] = None,
+     end_value: Optional[str] = None,
+ ) -> Iterator[Dict[str, Any]]:
+     """
+     Fetch records from Socrata dataset with pagination and optional filtering.
+
+     Uses offset-based pagination to get all records, not just first 50000.
+     Supports incremental loading via SoQL WHERE clause for server-side filtering.
+
+     Args:
+         domain: Socrata domain (e.g., "data.seattle.gov")
+         dataset_id: Dataset identifier (e.g., "6udu-fhnu")
+         app_token: Socrata app token for higher rate limits
+         username: Username for authentication
+         password: Password for authentication
+         start_value: Minimum value for incremental_key (inclusive)
+         end_value: Maximum value for incremental_key (exclusive)
+
+     Yields:
+         Lists of records (one list per page)
+
+     Raises:
+         requests.HTTPError: If API request fails
+     """
+     url = f"https://{domain}/resource/{dataset_id}.json"
+
+     headers = {"Accept": "application/json"}
+     if app_token:
+         headers["X-App-Token"] = app_token
+
+     auth = (username, password) if username and password else None
+
+     limit = DEFAULT_PAGE_SIZE
+     offset = 0
+
+     while True:
+         params: Dict[str, Any] = {"$limit": limit, "$offset": offset}
+
+         if incremental_key and start_value:
+             start_value_iso = str(start_value).replace(" ", "T")
+             where_conditions = [f"{incremental_key} >= '{start_value_iso}'"]
+
+             if end_value:
+                 end_value_iso = str(end_value).replace(" ", "T")
+                 where_conditions.append(f"{incremental_key} < '{end_value_iso}'")
+
+             params["$where"] = " AND ".join(where_conditions)
+             params["$order"] = f"{incremental_key} ASC"
+
+         response = requests.get(
+             url,
+             headers=headers,
+             auth=auth,
+             params=params,
+             timeout=REQUEST_TIMEOUT,
+         )
+         response.raise_for_status()
+
+         data = response.json()
+
+         if not data:
+             break
+
+         yield data
+
+         if len(data) < limit:
+             break
+
+         offset += limit
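
For concreteness, a sketch of the query parameters this loop sends for the first page of an incremental fetch; the cursor field and bounds are placeholders:

# Mirrors the params dict fetch_data builds for one page (values are placeholders).
def soql_params(incremental_key, start, end, offset):
    return {
        "$limit": 50000,
        "$offset": offset,
        "$where": f"{incremental_key} >= '{start}' AND {incremental_key} < '{end}'",
        "$order": f"{incremental_key} ASC",
    }

print(soql_params("updated_at", "2024-01-01T00:00:00", "2024-02-01T00:00:00", 0))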
ingestr/src/socrata_source/settings.py ADDED
@@ -0,0 +1,8 @@
+ """Socrata API settings and constants"""
+
+ # Request timeout in seconds
+ REQUEST_TIMEOUT = 30
+
+ # Maximum number of records to fetch per page
+ # Socrata API supports up to 50000 records per request
+ DEFAULT_PAGE_SIZE = 50000
ingestr/src/sources.py CHANGED
@@ -4066,3 +4066,260 @@ class AlliumSource:
              limit=limit,
              compute_profile=compute_profile,
          )
+
+
+ class CouchbaseSource:
+     table_builder: Callable
+
+     def __init__(self, table_builder=None) -> None:
+         if table_builder is None:
+             from ingestr.src.couchbase_source import couchbase_collection
+
+             table_builder = couchbase_collection
+
+         self.table_builder = table_builder
+
+     def handles_incrementality(self) -> bool:
+         return False
+
+     def dlt_source(self, uri: str, table: str, **kwargs):
+         """
+         Create a dlt source for reading data from Couchbase.
+
+         URI formats:
+         - couchbase://username:password@host
+         - couchbase://username:password@host/bucket
+         - couchbase://username:password@host?ssl=true
+         - couchbases://username:password@host (SSL enabled)
+
+         Table formats:
+         - bucket.scope.collection (when bucket not in URI)
+         - scope.collection (when bucket specified in URI path)
+
+         Note: If password contains special characters (@, :, /, etc.), they must be URL-encoded.
+
+         Examples:
+             Local/Self-hosted:
+             - couchbase://admin:password123@localhost with table "mybucket.myscope.mycollection"
+             - couchbase://admin:password123@localhost/mybucket with table "myscope.mycollection"
+             - couchbase://admin:password123@localhost?ssl=true with table "mybucket._default._default"
+
+             Capella (Cloud):
+             - couchbases://user:pass@cb.xxx.cloud.couchbase.com with table "travel-sample.inventory.airport"
+             - couchbase://user:pass@cb.xxx.cloud.couchbase.com/travel-sample?ssl=true with table "inventory.airport"
+
+         To encode password in Python:
+             from urllib.parse import quote
+             encoded_pwd = quote("MyPass@123!", safe='')
+             uri = f"couchbase://admin:{encoded_pwd}@localhost?ssl=true"
+
+         Args:
+             uri: Couchbase connection URI (can include /bucket path and ?ssl=true query parameter)
+             table: Format depends on URI:
+                 - bucket.scope.collection (if bucket not in URI)
+                 - scope.collection (if bucket in URI path)
+             **kwargs: Additional arguments:
+                 - limit: Maximum number of documents to fetch
+                 - incremental_key: Field to use for incremental loading
+                 - interval_start: Start value for incremental loading
+                 - interval_end: End value for incremental loading
+
+         Returns:
+             DltResource for the Couchbase collection
+         """
+         # Parse the URI to extract connection details
+         # urlparse automatically decodes URL-encoded credentials
+
+         parsed = urlparse(uri)
+
+         # Extract username and password from URI
+         # Note: urlparse automatically decodes URL-encoded characters in username/password
+         from urllib.parse import unquote
+
+         username = parsed.username
+         password = unquote(parsed.password) if parsed.password else None
+
+         if not username or not password:
+             raise ValueError(
+                 "Username and password must be provided in the URI.\n"
+                 "Format: couchbase://username:password@host\n"
+                 "If password has special characters (@, :, /), URL-encode them.\n"
+                 "Example: couchbase://admin:MyPass%40123@localhost for password 'MyPass@123'"
+             )
+
+         # Reconstruct connection string without credentials
+         scheme = parsed.scheme
+         netloc = parsed.netloc
+
+         # Remove username:password@ from netloc if present
+         if "@" in netloc:
+             netloc = netloc.split("@", 1)[1]
+
+         # Parse query parameters from URI
+         from urllib.parse import parse_qs
+
+         query_params = parse_qs(parsed.query)
+
+         # Check if SSL is requested via URI query parameter (?ssl=true)
+         if "ssl" in query_params:
+             ssl_value = query_params["ssl"][0].lower()
+             use_ssl = ssl_value in ("true", "1", "yes")
+
+             # Apply SSL scheme based on parameter
+             if use_ssl and scheme == "couchbase":
+                 scheme = "couchbases"
+
+         connection_string = f"{scheme}://{netloc}"
+
+         # Extract bucket from URI path if present (e.g., couchbase://host/bucket)
+         bucket_from_uri = None
+         if parsed.path and parsed.path.strip("/"):
+             bucket_from_uri = parsed.path.strip("/").split("/")[0]
+
+         # Parse table format: can be "scope.collection" or "bucket.scope.collection"
+         table_parts = table.split(".")
+
+         if len(table_parts) == 3:
+             # Format: bucket.scope.collection
+             bucket, scope, collection = table_parts
+         elif len(table_parts) == 2:
+             # Format: scope.collection (bucket from URI)
+             if bucket_from_uri:
+                 bucket = bucket_from_uri
+                 scope, collection = table_parts
+             else:
+                 raise ValueError(
+                     "Table format is 'scope.collection' but no bucket specified in URI.\n"
+                     f"Either use URI format: couchbase://user:pass@host/bucket\n"
+                     f"Or use table format: bucket.scope.collection\n"
+                     f"Got table: {table}"
+                 )
+         else:
+             raise ValueError(
+                 "Table format must be 'bucket.scope.collection' or 'scope.collection' (with bucket in URI). "
+                 f"Got: {table}\n"
+                 "Examples:\n"
+                 "  - URI: couchbase://user:pass@host, Table: travel-sample.inventory.airport\n"
+                 "  - URI: couchbase://user:pass@host/travel-sample, Table: inventory.airport"
+             )
+
+         # Handle incremental loading
+         incremental = None
+         if kwargs.get("incremental_key"):
+             start_value = kwargs.get("interval_start")
+             end_value = kwargs.get("interval_end")
+
+             incremental = dlt_incremental(
+                 kwargs.get("incremental_key", ""),
+                 initial_value=start_value,
+                 end_value=end_value,
+                 range_end="closed",
+                 range_start="closed",
+             )
+
+         # Get optional parameters
+         limit = kwargs.get("limit")
+
+         table_instance = self.table_builder(
+             connection_string=connection_string,
+             username=username,
+             password=password,
+             bucket=bucket,
+             scope=scope,
+             collection=collection,
+             incremental=incremental,
+             limit=limit,
+         )
+         table_instance.max_table_nesting = 1
+
+         return table_instance
+
+
+ class SocrataSource:
+     def handles_incrementality(self) -> bool:
+         return False
+
+     def dlt_source(self, uri: str, table: str, **kwargs):
+         """
+         Creates a DLT source for Socrata open data platform.
+
+         URI format: socrata://domain?app_token=TOKEN
+         Table: dataset_id (e.g., "6udu-fhnu")
+
+         Args:
+             uri: Socrata connection URI with domain and optional auth params
+             table: Dataset ID (e.g., "6udu-fhnu")
+             **kwargs: Additional arguments:
+                 - incremental_key: Field to use for incremental loading (e.g., ":updated_at")
+                 - interval_start: Start date for initial load
+                 - interval_end: End date for load
+                 - primary_key: Primary key field for merge operations
+
+         Returns:
+             DltResource for the Socrata dataset
+         """
+         from urllib.parse import parse_qs, urlparse
+
+         parsed = urlparse(uri)
+
+         domain = parsed.netloc
+         if not domain:
+             raise ValueError(
+                 "Domain must be provided in the URI.\n"
+                 "Format: socrata://domain?app_token=TOKEN\n"
+                 "Example: socrata://evergreen.data.socrata.com?app_token=mytoken"
+             )
+
+         query_params = parse_qs(parsed.query)
+
+         dataset_id = table
+         if not dataset_id:
+             raise ValueError(
+                 "Dataset ID must be provided as the table parameter.\n"
+                 "Example: --source-table 6udu-fhnu"
+             )
+
+         app_token = query_params.get("app_token", [None])[0]
+         username = query_params.get("username", [None])[0]
+         password = query_params.get("password", [None])[0]
+
+         incremental = None
+         if kwargs.get("incremental_key"):
+             start_value = kwargs.get("interval_start")
+             end_value = kwargs.get("interval_end")
+
+             if start_value:
+                 start_value = (
+                     start_value.isoformat()
+                     if hasattr(start_value, "isoformat")
+                     else str(start_value)
+                 )
+
+             if end_value:
+                 end_value = (
+                     end_value.isoformat()
+                     if hasattr(end_value, "isoformat")
+                     else str(end_value)
+                 )
+
+             incremental = dlt_incremental(
+                 kwargs.get("incremental_key", ""),
+                 initial_value=start_value,
+                 end_value=end_value,
+                 range_end="open",
+                 range_start="closed",
+             )
+
+         primary_key = kwargs.get("primary_key")
+
+         from ingestr.src.socrata_source import source
+
+         return source(
+             domain=domain,
+             dataset_id=dataset_id,
+             app_token=app_token,
+             username=username,
+             password=password,
+             incremental=incremental,
+             primary_key=primary_key,
+         ).with_resources("dataset")
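
A sketch of exercising the two new source classes directly; the URIs, credentials, and table names are placeholders taken from the docstring examples above:

from ingestr.src.sources import CouchbaseSource, SocrataSource

# Both calls return dlt sources/resources that the factory hands to the pipeline.
couchbase = CouchbaseSource().dlt_source(
    uri="couchbase://admin:password123@localhost",
    table="mybucket._default._default",
    limit=10,
)

socrata = SocrataSource().dlt_source(
    uri="socrata://evergreen.data.socrata.com?app_token=mytoken",
    table="6udu-fhnu",
)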
ingestr-0.14.93.dist-info/METADATA → ingestr-0.14.96.dist-info/METADATA RENAMED
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: ingestr
- Version: 0.14.93
+ Version: 0.14.96
  Summary: ingestr is a command-line application that ingests data from various sources and stores them in any database.
  Project-URL: Homepage, https://github.com/bruin-data/ingestr
  Project-URL: Issues, https://github.com/bruin-data/ingestr/issues
@@ -39,6 +39,7 @@ Requires-Dist: clickhouse-connect==0.8.14
  Requires-Dist: clickhouse-driver==0.2.9
  Requires-Dist: clickhouse-sqlalchemy==0.2.7
  Requires-Dist: confluent-kafka==2.8.0
+ Requires-Dist: couchbase==4.3.6
  Requires-Dist: crate==2.0.0
  Requires-Dist: cryptography==44.0.2
  Requires-Dist: curlify==2.2.1
ingestr-0.14.93.dist-info/RECORD → ingestr-0.14.96.dist-info/RECORD RENAMED
@@ -2,17 +2,17 @@ ingestr/conftest.py,sha256=OE2yxeTCosS9CUFVuqNypm-2ftYvVBeeq7egm3878cI,1981
  ingestr/main.py,sha256=qo0g3wCFl8a_1jUwXagX8L1Q8PKKQlTF7md9pfnzW0Y,27155
  ingestr/src/.gitignore,sha256=8cX1AZTSI0TcdZFGTmS_oyBjpfCzhOEt0DdAo2dFIY8,203
  ingestr/src/blob.py,sha256=UUWMjHUuoR9xP1XZQ6UANQmnMVyDx3d0X4-2FQC271I,2138
- ingestr/src/buildinfo.py,sha256=gpczaxQtINGa_cWhMVJsfeFoUxh-gKIyba1YESpTmpk,21
+ ingestr/src/buildinfo.py,sha256=-9qPR_WQg9aaTRg324DJAZs43V_FQHsRu9G9xDfXrjE,21
  ingestr/src/destinations.py,sha256=QtjE0AGs0WkPHaI2snWPHJ8HHi4lwXUBYLJPklz8Mvk,27772
  ingestr/src/errors.py,sha256=fhJ2BxOqOsBfOxuTDKfZblvawBrPG3x_1VikIxMZBRI,874
- ingestr/src/factory.py,sha256=k_8jgehOM2sHwCsjliYXmQhICl2B1UYoAs6vspjadv8,7770
+ ingestr/src/factory.py,sha256=iFOFbwifvQf7qOtSoNPS6RGvAhsRaX7HzbjouHmSvfs,7882
  ingestr/src/filters.py,sha256=0n0sNAVG_f-B_1r7lW5iNtw9z_G1bxWzPaiL1i6tnbU,1665
  ingestr/src/http_client.py,sha256=bxqsk6nJNXCo-79gW04B53DQO-yr25vaSsqP0AKtjx4,732
  ingestr/src/loader.py,sha256=9NaWAyfkXdqAZSS-N72Iwo36Lbx4PyqIfaaH1dNdkFs,1712
  ingestr/src/masking.py,sha256=VN0LdfvExhQ1bZMRylGtaBUIoH-vjuIUmRnYKwo3yiY,11358
  ingestr/src/partition.py,sha256=BrIP6wFJvyR7Nus_3ElnfxknUXeCipK_E_bB8kZowfc,969
  ingestr/src/resource.py,sha256=ZqmZxFQVGlF8rFPhBiUB08HES0yoTj8sZ--jKfaaVps,1164
- ingestr/src/sources.py,sha256=D4VxA-yqilzTG0VBJBxnw9MUJ1Qeo2EpKjVGJfoMKoY,142289
+ ingestr/src/sources.py,sha256=JVZf22XgIFXov3-yKOjsbQVw9cV_LrDeXD6eb4Z6jFk,151802
  ingestr/src/table_definition.py,sha256=REbAbqdlmUMUuRh8nEQRreWjPVOQ5ZcfqGkScKdCrmk,390
  ingestr/src/time.py,sha256=H_Fk2J4ShXyUM-EMY7MqCLZQhlnZMZvO952bmZPc4yE,254
  ingestr/src/version.py,sha256=J_2xgZ0mKlvuHcjdKCx2nlioneLH0I47JiU_Slr_Nwc,189
@@ -43,6 +43,8 @@ ingestr/src/chess/settings.py,sha256=p0RlCGgtXUacPDEvZmwzSWmzX0Apj1riwfz-nrMK89k
  ingestr/src/clickup/__init__.py,sha256=uvfAqNturT4bMvU4NS3E8BdL6nvDFzNuh7bMlih8HJk,2547
  ingestr/src/clickup/helpers.py,sha256=RzDKMUAHccuDhocIQ2ToBXfCERo8CBJqA3t-IPltBCE,1519
  ingestr/src/collector/spinner.py,sha256=_ZUqF5MI43hVIULdjF5s5mrAZbhEFXaiWirQmrv3Yk4,1201
+ ingestr/src/couchbase_source/__init__.py,sha256=IPmb55mBxGWtt_9ywbY6chAwUp6jRmJTu-qEVFBhJ_s,4381
+ ingestr/src/couchbase_source/helpers.py,sha256=RA0aFT0GfLJ2pHy7emvKmm0yVXgQOQ-hMVJvw-FExNo,4487
  ingestr/src/docebo/__init__.py,sha256=RBBjlt405PIIDOLEt78g9yBNJfhUMeJxR5DZD7oufXY,27543
  ingestr/src/docebo/client.py,sha256=nki0kNQhN8VDz5cdqlQQPhr1JMPlcNEYKnWK3umAyOc,15663
  ingestr/src/docebo/helpers.py,sha256=SaEjta6k3Lj-S5fvrheA5_xj7zfASMdOc_ihsqno5ko,3238
@@ -140,8 +142,8 @@ ingestr/src/plusvibeai/__init__.py,sha256=Uo-N2-1kbq5RJw8ym5tm8rqVchVbJJ2hOd6bws
  ingestr/src/plusvibeai/helpers.py,sha256=5hxxA2-XUtkZA1xrstZ39ilzUh4EouNDOiiL-NzGu9w,17939
  ingestr/src/plusvibeai/settings.py,sha256=3Hb7jcUNshSlGO4E27yUe_8n3f0VArX9XTmkTkN-Tvo,5366
  ingestr/src/quickbooks/__init__.py,sha256=cZUuVCOTGPHTscRj6i0DytO63_fWF-4ieMxoU4PcyTg,3727
- ingestr/src/revenuecat/__init__.py,sha256=5HbyZuEOekkbeeT72sM_bnGygSyYdmd_vczfAUz7xoM,4029
- ingestr/src/revenuecat/helpers.py,sha256=CYU6l79kplnfL87GfdxyGeEBrBSWEZfGP0GyjPHuVDk,9619
+ ingestr/src/revenuecat/__init__.py,sha256=j75jkHBqd_9FsFKjsSLLwKrPcmUKOE3HJ95Qzonzmbk,2779
+ ingestr/src/revenuecat/helpers.py,sha256=ej_bR6cuNOer4bTQfd_IuyMmt-xevcPgvRShKlxO8Xo,7998
  ingestr/src/salesforce/__init__.py,sha256=Ijveo8gyo_wLzQRBklxIm3RV0y2Gta9-mR44RbJljpI,4901
  ingestr/src/salesforce/helpers.py,sha256=QTdazBt-qRTBbCQMZnyclIaDQFmBixBy_RDKD00Lt-8,2492
  ingestr/src/shopify/__init__.py,sha256=RzSSG93g-Qlkz6TAxi1XasFDdxxtVXIo53ZTtjGczW4,62602
@@ -152,6 +154,9 @@ ingestr/src/slack/__init__.py,sha256=pyDukxcilqTAe_bBzfWJ8Vxi83S-XEdEFBH2pEgILrM
  ingestr/src/slack/helpers.py,sha256=08TLK7vhFvH_uekdLVOLF3bTDe1zgH0QxHObXHzk1a8,6545
  ingestr/src/slack/settings.py,sha256=NhKn4y1zokEa5EmIZ05wtj_-I0GOASXZ5V81M1zXCtY,457
  ingestr/src/smartsheets/__init__.py,sha256=RIEfN1T2TMFg8T0RvN4o6sqC58YusJRDrmE9Isos5P4,2375
+ ingestr/src/socrata_source/__init__.py,sha256=K5DVpsVXTMfunZd5YoEsn1nipfo1zavFS59g3m2tsc8,2984
+ ingestr/src/socrata_source/helpers.py,sha256=KbVojFSmMLXb0ajh8bhqfZfxDHH7rQ3nyI8p2jxVifA,2500
+ ingestr/src/socrata_source/settings.py,sha256=DLfu-4HOa5nR7h9tbOySEa2ye3w_Z6TYZ9_zPqWaNQk,220
  ingestr/src/solidgate/__init__.py,sha256=Ts83j-JSnFsFuF4tDhVOfZKg7H0-bIpfn3kg1ZOR58A,8003
  ingestr/src/solidgate/helpers.py,sha256=mAsW_1hpD7ab3Y2vw8fxHi4yD3aT1geLdIYZ7ycyxBc,5690
  ingestr/src/sql_database/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -184,8 +189,8 @@ ingestr/testdata/merge_expected.csv,sha256=DReHqWGnQMsf2PBv_Q2pfjsgvikYFnf1zYcQZ
  ingestr/testdata/merge_part1.csv,sha256=Pw8Z9IDKcNU0qQHx1z6BUf4rF_-SxKGFOvymCt4OY9I,185
  ingestr/testdata/merge_part2.csv,sha256=T_GiWxA81SN63_tMOIuemcvboEFeAmbKc7xRXvL9esw,287
  ingestr/tests/unit/test_smartsheets.py,sha256=zf3DXT29Y4TH2lNPBFphdjlaelUUyPJcsW2UO68RzDs,4862
- ingestr-0.14.93.dist-info/METADATA,sha256=ttKTQKjoXX_xzXbQb2LisUnePWrFx5GXQf2dHCsG48g,15327
- ingestr-0.14.93.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
- ingestr-0.14.93.dist-info/entry_points.txt,sha256=oPJy0KBnPWYjDtP1k8qwAihcTLHSZokSQvRAw_wtfJM,46
- ingestr-0.14.93.dist-info/licenses/LICENSE.md,sha256=cW8wIhn8HFE-KLStDF9jHQ1O_ARWP3kTpk_-eOccL24,1075
- ingestr-0.14.93.dist-info/RECORD,,
+ ingestr-0.14.96.dist-info/METADATA,sha256=vnkdaQVPvlnpHq9UgecuzRSSb_IiKE6_gS1jLkYzGEY,15359
+ ingestr-0.14.96.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+ ingestr-0.14.96.dist-info/entry_points.txt,sha256=oPJy0KBnPWYjDtP1k8qwAihcTLHSZokSQvRAw_wtfJM,46
+ ingestr-0.14.96.dist-info/licenses/LICENSE.md,sha256=cW8wIhn8HFE-KLStDF9jHQ1O_ARWP3kTpk_-eOccL24,1075
+ ingestr-0.14.96.dist-info/RECORD,,