quickbase-extract 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,234 @@
1
+ """Unified cache management for local dev and Lambda environments."""
2
+
3
+ import logging
4
+ import os
5
+ from pathlib import Path
6
+
7
+ import boto3
8
+
9
+ from quickbase_extract.utils import normalize_name
10
+
11
+ logger = logging.getLogger(__name__)
12
+
13
+
14
+ class CacheManager:
15
+ """Manages cache reads/writes for both local and Lambda environments.
16
+
17
+ Supports local file-based caching and S3-backed caching on Lambda.
18
+ Cache root path is configurable via QUICKBASE_CACHE_ROOT environment variable.
19
+ """
20
+
21
+ def __init__(self, cache_root: Path | None = None):
22
+ """Initialize the cache manager.
23
+
24
+ Args:
25
+ cache_root: Path to cache root directory. If not provided, uses
26
+ QUICKBASE_CACHE_ROOT env var, or defaults based on environment.
27
+ """
28
+ self.is_lambda = bool(os.environ.get("AWS_LAMBDA_FUNCTION_NAME"))
29
+ self.environment = os.environ.get("ENV", "dev")
30
+ self.s3_bucket = os.environ.get("CACHE_BUCKET")
31
+ self.s3_client = boto3.client("s3") if self.is_lambda else None
32
+
33
+ # Determine cache root path
34
+ if cache_root:
35
+ # Explicitly provided
36
+ self.cache_root = Path(cache_root)
37
+ elif os.environ.get("QUICKBASE_CACHE_ROOT"):
38
+ # From environment variable
39
+ self.cache_root = Path(os.environ.get("QUICKBASE_CACHE_ROOT"))
40
+ else:
41
+ # Default based on environment
42
+ if self.is_lambda:
43
+ self.cache_root = Path("/tmp/quickbase-extract/data")
44
+ else:
45
+ # Local: use current working directory or home
46
+ self.cache_root = Path.cwd() / ".quickbase-cache" / self.environment
47
+
48
+ self.cache_root.mkdir(parents=True, exist_ok=True)
49
+ logger.debug(f"Cache root: {self.cache_root}")
50
+
51
+ def get_metadata_path(self, app_name: str, table_name: str, report_name: str) -> Path:
52
+ """Get path for report metadata file.
53
+
54
+ Args:
55
+ app_name: Application name.
56
+ table_name: Table name.
57
+ report_name: Report name.
58
+
59
+ Returns:
60
+ Path object for the metadata file.
61
+
62
+ Example:
63
+ >>> cache_mgr.get_metadata_path("Sales Tracker", "Opportunities", "Open Deals")
64
+ PosixPath('.quickbase-cache/dev/report_metadata/sales_tracker/opportunities_open_deals.json')
65
+ """
66
+ app_fmt = normalize_name(app_name)
67
+ table_fmt = normalize_name(table_name)
68
+ report_fmt = normalize_name(report_name)
69
+
70
+ path = self.cache_root / "report_metadata" / app_fmt / f"{table_fmt}_{report_fmt}.json"
71
+ path.parent.mkdir(parents=True, exist_ok=True)
72
+ return path
73
+
74
+ def get_data_path(self, app_name: str, table_name: str, report_name: str) -> Path:
75
+ """Get path for report data file.
76
+
77
+ Args:
78
+ app_name: Application name.
79
+ table_name: Table name.
80
+ report_name: Report name.
81
+
82
+ Returns:
83
+ Path object for the data file.
84
+
85
+ Example:
86
+ >>> cache_mgr.get_data_path("Sales Tracker", "Opportunities", "Open Deals")
87
+ PosixPath('.quickbase-cache/dev/report_data/sales_tracker/opportunities_open_deals_data.json')
88
+ """
89
+ app_fmt = normalize_name(app_name)
90
+ table_fmt = normalize_name(table_name)
91
+ report_fmt = normalize_name(report_name)
92
+
93
+ path = self.cache_root / "report_data" / app_fmt / f"{table_fmt}_{report_fmt}_data.json"
94
+ path.parent.mkdir(parents=True, exist_ok=True)
95
+ return path
96
+
97
+ def write_file(self, file_path: Path, content: str) -> None:
98
+ """Write cache file and sync to S3 if on Lambda.
99
+
100
+ Args:
101
+ file_path: Path where file should be written.
102
+ content: String content to write.
103
+
104
+ Raises:
105
+ Exception: If S3 sync fails on Lambda (required for operation success).
106
+
107
+ Example:
108
+ >>> cache_mgr.write_file(Path("metadata.json"), '{"key": "value"}')
109
+ """
110
+ file_path.parent.mkdir(parents=True, exist_ok=True)
111
+ file_path.write_text(content)
112
+
113
+ if self.is_lambda and self.s3_client:
114
+ self._sync_to_s3(file_path)
115
+
116
+ def read_file(self, file_path: Path) -> str:
117
+ """Read cache file.
118
+
119
+ Args:
120
+ file_path: Path to file to read.
121
+
122
+ Returns:
123
+ File contents as string.
124
+
125
+ Raises:
126
+ FileNotFoundError: If file does not exist.
127
+
128
+ Example:
129
+ >>> content = cache_mgr.read_file(Path("metadata.json"))
130
+ """
131
+ if not file_path.exists():
132
+ raise FileNotFoundError(f"Cache file not found: {file_path}")
133
+ return file_path.read_text()
134
+
135
+ def _sync_to_s3(self, file_path: Path) -> None:
136
+ """Upload file to S3 for persistence across Lambda invocations.
137
+
138
+ Args:
139
+ file_path: Path to file to upload.
140
+
141
+ Raises:
142
+ Exception: If upload fails. This is critical - Lambda /tmp is ephemeral.
143
+ """
144
+ try:
145
+ relative_path = file_path.relative_to(self.cache_root)
146
+ s3_key = f"{self.environment}/{relative_path}"
147
+ self.s3_client.upload_file(str(file_path), self.s3_bucket, s3_key)
148
+ logger.info(f"Synced {s3_key} to S3")
149
+ except Exception as e:
150
+ logger.error(f"Failed to sync {file_path} to S3: {e}")
151
+ raise
152
+
153
+ def sync_from_s3(self) -> None:
154
+ """Download all cache files from S3 to /tmp (Lambda only).
155
+
156
+ Restores cache from S3 at Lambda initialization. Only runs on Lambda.
157
+ Logs and continues if bucket not configured.
158
+
159
+ Raises:
160
+ Exception: If S3 operations fail.
161
+
162
+ Note:
163
+ Lambda /tmp has storage limits (default 512 MB, max 10 GB).
164
+ Current cache size (~32 MB) is well within limits.
165
+ """
166
+ if not self.is_lambda or not self.s3_client:
167
+ logger.debug("Not in Lambda or S3 client unavailable, skipping S3 sync")
168
+ return
169
+
170
+ if not self.s3_bucket:
171
+ logger.debug("CACHE_BUCKET not set, skipping S3 sync")
172
+ return
173
+
174
+ logger.info(f"Syncing cache from S3 for environment: {self.environment}")
175
+ try:
176
+ paginator = self.s3_client.get_paginator("list_objects_v2")
177
+ pages = paginator.paginate(Bucket=self.s3_bucket, Prefix=f"{self.environment}/")
178
+
179
+ file_count = 0
180
+ for page in pages:
181
+ for obj in page.get("Contents", []):
182
+ s3_key = obj["Key"]
183
+ if not s3_key or s3_key.endswith("/"):
184
+ continue
185
+
186
+ # Extract relative path (remove environment prefix)
187
+ relative_key = s3_key.replace(f"{self.environment}/", "", 1)
188
+ local_path = self.cache_root / relative_key
189
+
190
+ local_path.parent.mkdir(parents=True, exist_ok=True)
191
+ self.s3_client.download_file(self.s3_bucket, s3_key, str(local_path))
192
+ file_count += 1
193
+
194
+ logger.info(f"Synced {file_count} files from S3")
195
+ except Exception as e:
196
+ logger.error(f"Failed to sync from S3: {e}")
197
+ raise
198
+
199
+
200
+ # Singleton instance
201
+ _cache_manager: CacheManager | None = None
202
+
203
+
204
def get_cache_manager(cache_root: Path | None = None) -> CacheManager:
    """Get or create cache manager singleton instance.

    Args:
        cache_root: Optional path to cache root. Only used on first call.
            Subsequent calls ignore this parameter and return the existing instance.

    Returns:
        Singleton CacheManager instance.

    Warning:
        The cache_root parameter is only respected on the first call. If a
        later call passes a different cache_root, the existing instance is
        still returned and a warning is logged. If you need to change cache
        locations, use CacheManager directly instead of the singleton.

    Example:
        >>> cache_mgr = get_cache_manager(Path("/custom/cache"))
        >>> # Later calls ignore cache_root
        >>> same_mgr = get_cache_manager(Path("/different/path"))  # Returns first instance
    """
    global _cache_manager
    if _cache_manager is None:
        _cache_manager = CacheManager(cache_root=cache_root)
    elif cache_root and Path(cache_root) != _cache_manager.cache_root:
        # Surface the mismatch instead of silently using another directory.
        logger.warning(
            "Ignoring cache_root=%s: cache manager singleton already uses %s",
            cache_root,
            _cache_manager.cache_root,
        )
    return _cache_manager
227
+
228
+
229
def _reset_cache_manager() -> None:
    """Reset the singleton cache manager. For testing only.

    After this call the next ``get_cache_manager`` call builds a fresh
    ``CacheManager`` instance.
    """
    global _cache_manager
    _cache_manager = None
@@ -0,0 +1,74 @@
1
+ """S3-backed cache sync for Lambda environments."""
2
+
3
+ import logging
4
+
5
+ from quickbase_extract.cache_manager import get_cache_manager
6
+
7
+ logger = logging.getLogger(__name__)
8
+
9
+ # Module-level flag to track if we've synced this Lambda invocation
10
+ _CACHE_SYNCED = False
11
+
12
+
13
def sync_from_s3_once(force: bool = False) -> None:
    """Download cache from S3 to the local cache directory, at most once.

    The first call delegates to ``CacheManager.sync_from_s3`` and then sets a
    module-level flag. Later calls are no-ops unless force=True.

    The flag lives at module level, so on Lambda it persists across warm
    invocations that reuse the same loaded module; in practice only a cold
    start (fresh module import) triggers a sync.

    The cache manager itself skips the download when it is not running on
    Lambda or when CACHE_BUCKET is not set. In those cases this function
    still marks the cache as synced but downloads nothing.

    Args:
        force: If True, sync even if the flag is already set.
            Defaults to False.

    Raises:
        Exception: If S3 operations fail. The flag is left unset in that case.

    Example:
        >>> # In Lambda handler initialization
        >>> sync_from_s3_once()  # Syncs on cold start
        >>> sync_from_s3_once()  # No-op afterwards
        >>>
        >>> # Force re-sync if needed
        >>> sync_from_s3_once(force=True)
    """
    global _CACHE_SYNCED

    if _CACHE_SYNCED and not force:
        logger.debug("Cache already synced in this invocation, skipping")
        return

    cache_mgr = get_cache_manager()
    cache_mgr.sync_from_s3()  # Handles Lambda detection internally
    _CACHE_SYNCED = True

    # Only claim a sync at INFO level when the manager was able to perform one.
    if cache_mgr.is_lambda and cache_mgr.s3_bucket:
        logger.info("Cache synced from S3")
    else:
        logger.debug("S3 sync not applicable (not on Lambda or CACHE_BUCKET unset), nothing downloaded")
51
+
52
+
53
def is_cache_synced() -> bool:
    """Report whether the S3 cache sync has already been performed.

    Returns the module-level flag that ``sync_from_s3_once`` sets after a
    completed call and ``_reset_cache_sync`` clears.

    Returns:
        True if the flag is set, False otherwise.

    Example:
        >>> if not is_cache_synced():
        ...     print("Cache needs syncing")
    """
    synced = _CACHE_SYNCED
    return synced
64
+
65
+
66
def _reset_cache_sync() -> None:
    """Clear the module-level "already synced" flag. For testing only.

    Example:
        >>> # In test teardown
        >>> _reset_cache_sync()
    """
    globals()["_CACHE_SYNCED"] = False
@@ -0,0 +1,61 @@
1
+ """Quickbase API client factory."""
2
+
3
+ import logging
4
+ from typing import Any
5
+
6
+ import quickbase_api
7
+
8
+ logger = logging.getLogger(__name__)
9
+
10
+ # Cache for client instances (realm, user_token) -> client
11
+ _client_cache: dict[tuple[str, str], Any] = {}
12
+
13
+
14
def get_qb_client(realm: str, user_token: str, cache: bool = True) -> Any:
    """Return a Quickbase API client for the given realm and token.

    With ``cache=True`` the client is stored in, and served from, a
    module-level dict keyed by ``(realm, user_token)``. With ``cache=False``
    a new client is always built and the dict is neither read nor updated.

    Args:
        realm: Quickbase realm (e.g., 'example.quickbase.com').
        user_token: Quickbase user token (from environment or config).
        cache: Whether to reuse cached client. Defaults to True.

    Returns:
        Quickbase API client instance.

    Raises:
        ValueError: If realm or user_token is empty or whitespace-only.
        Exception: If client creation fails (logged, then re-raised).
    """
    # Reject missing or blank credentials before touching the cache.
    if not (realm and realm.strip()):
        raise ValueError("Realm cannot be empty")
    if not (user_token and user_token.strip()):
        raise ValueError("User token cannot be empty")

    key = (realm, user_token)
    if cache and key in _client_cache:
        logger.debug(f"Returning cached Quickbase client for realm: {realm}")
        return _client_cache[key]

    try:
        new_client = quickbase_api.client(realm=realm, user_token=user_token)
        logger.debug(f"Created Quickbase client for realm: {realm}")

        if cache:
            _client_cache[key] = new_client

        return new_client
    except Exception as exc:
        logger.error(f"Failed to create Quickbase client for realm {realm}: {exc}")
        raise
56
+
57
+
58
def _reset_client_cache() -> None:
    """Replace the module-level client cache with a new empty dict. For testing only."""
    global _client_cache
    _client_cache = dict()
File without changes
@@ -0,0 +1,253 @@
1
+ """Quickbase data fetching, caching, and loading."""
2
+
3
+ import json
4
+ import logging
5
+ from concurrent.futures import ThreadPoolExecutor, as_completed
6
+
7
+ from quickbase_extract.api_handlers import handle_query
8
+ from quickbase_extract.cache_manager import get_cache_manager
9
+
10
+ logger = logging.getLogger(__name__)
11
+
12
+
13
def _flatten_and_relabel_records(records: list[dict], field_label: dict, fields: list[int]) -> list[dict]:
    """Transform Quickbase records to flat dicts with field labels as keys.

    Field IDs may arrive as ints or strings in any of the three inputs; all
    of them are normalized to strings before matching, so an integer ID in
    ``field_label`` or ``fields`` matches a string key in a record.

    Args:
        records: List of records from Quickbase API, each shaped like
            ``{field_id: {"value": actual}}``.
        field_label: Dict mapping field labels to IDs.
        fields: List of field IDs in desired order.

    Returns:
        One dict per record with field labels as keys, ordered like
        ``fields``. Fields that are missing from the record or have no label
        in ``field_label`` are omitted.

    Raises:
        KeyError: If any field entry in a record lacks a "value" key.
    """
    # Build reverse mapping: field ID (as str) -> label
    id_to_label = {str(field_id): label for label, field_id in field_label.items()}
    field_order = [str(f) for f in fields]

    relabeled = []
    for record in records:
        # Flatten: {field_id: {value: actual}} -> {field_id: actual}
        flat = {str(fid): cell["value"] for fid, cell in record.items()}

        # Keep report field order while swapping field IDs for labels.
        relabeled.append(
            {id_to_label[fid]: flat[fid] for fid in field_order if fid in flat and fid in id_to_label}
        )

    return relabeled
42
+
43
+
44
def get_data(
    client,
    report_metadata: dict,
    report_desc: str,
    cache: bool = False,
    cache_root=None,
) -> list[dict]:
    """Fetch one report's records from Quickbase and optionally cache them.

    The metadata entry for ``report_desc`` supplies the table ID, field
    list, filter and sort order passed to ``handle_query``. The returned
    records are flattened and relabeled with field labels as keys.

    Args:
        client: Quickbase API client.
        report_metadata: Full metadata dict (from load_report_metadata_batch).
            The entry for ``report_desc`` must contain ``app_name``,
            ``table_name``, ``report_name``, ``table_id``, ``fields``,
            ``filter``, ``field_label`` and ``report["query"]["sortBy"]``.
        report_desc: Unique description of a specific table report.
        cache: Whether to write the result to the cache as JSON. Defaults to False.
        cache_root: Optional cache root path, forwarded to get_cache_manager.

    Returns:
        List of dicts with field labels as keys.

    Raises:
        KeyError: If report_desc or a required metadata key is missing.
        Exception: If Quickbase API query fails.

    Example:
        >>> metadata = load_report_metadata_batch(configs)
        >>> data = get_data(client, metadata, "sales_open_deals", cache=True)
        >>> print(f"Found {len(data)} records")
    """
    meta = report_metadata[report_desc]
    app_name, table_name, report_name = meta["app_name"], meta["table_name"], meta["report_name"]

    response = handle_query(
        client,
        meta["table_id"],
        select=meta["fields"],
        where=meta["filter"],
        sort_by=meta["report"]["query"]["sortBy"],
    )
    raw_records = response["data"]
    rows = _flatten_and_relabel_records(raw_records, meta["field_label"], meta["fields"])

    if not cache:
        logger.info(f"{report_desc} data fetched but not cached ({len(rows)} records)")
        return rows

    # Persist the transformed rows as pretty-printed JSON.
    manager = get_cache_manager(cache_root=cache_root)
    target = manager.get_data_path(app_name, table_name, report_name)
    manager.write_file(target, json.dumps(rows, indent=4))
    logger.info(f"{report_desc} data cached ({len(rows)} records)")
    return rows
101
+
102
+
103
def get_data_parallel(
    client,
    report_metadata: dict,
    report_descriptions: list[str],
    cache: bool = False,
    cache_root=None,
    max_workers: int = 8,
) -> dict[str, list[dict]]:
    """Fetch multiple reports concurrently using a thread pool.

    One ``get_data`` call is submitted per report description. Results are
    collected as tasks finish. On the first failure, tasks that have not
    started yet are cancelled and the exception is re-raised; leaving the
    executor's ``with`` block still waits for tasks that are already running.

    Args:
        client: Quickbase API client. It is shared by all worker threads.
        report_metadata: Full metadata dict (from load_report_metadata_batch).
        report_descriptions: List of report descriptions to fetch.
        cache: Whether to cache retrieved data. Defaults to False.
        cache_root: Optional cache root path, forwarded to get_data.
        max_workers: Maximum number of concurrent threads. Default is 8.

    Returns:
        Dict mapping report_description -> list of record dicts. Empty dict
        if report_descriptions is empty.

    Raises:
        Exception: The first exception raised by any fetch (for example a
            KeyError for an unknown report description).

    Example:
        >>> metadata = load_report_metadata_batch(configs)
        >>> descriptions = ["sales_open_deals", "sales_contacts"]
        >>> all_data = get_data_parallel(client, metadata, descriptions, cache=True)
        >>> print(f"Fetched {len(all_data)} reports")
    """
    if not report_descriptions:
        logger.warning("No report descriptions provided, nothing to fetch")
        return {}

    total_reports = len(report_descriptions)
    logger.info(f"Starting parallel fetch for {total_reports} reports with {max_workers} workers")

    fetched = {}
    with ThreadPoolExecutor(max_workers=max_workers) as pool:
        # Remember which report each future belongs to.
        pending = {}
        for desc in report_descriptions:
            task = pool.submit(
                get_data,
                client,
                report_metadata,
                desc,
                cache=cache,
                cache_root=cache_root,
            )
            pending[task] = desc

        # Collect in completion order; stop at the first failure.
        for task in as_completed(pending):
            desc = pending[task]
            try:
                fetched[desc] = task.result()  # Individual fetches are logged in get_data
            except Exception as exc:
                # Drop everything that has not started running yet.
                pool.shutdown(wait=False, cancel_futures=True)
                logger.error(f"Failed to fetch {desc}: {exc}")
                raise

    logger.info(f"Successfully completed parallel fetch for all {total_reports} reports")
    return fetched
182
+
183
+
184
def load_data(report_metadata: dict, report_desc: str, cache_root=None) -> list[dict]:
    """Read a report's cached records back from the JSON cache file.

    Args:
        report_metadata: Full metadata dict (from load_report_metadata_batch).
            The entry for ``report_desc`` must contain ``app_name``,
            ``table_name`` and ``report_name``.
        report_desc: Unique description of a specific table report.
        cache_root: Optional cache root path, forwarded to get_cache_manager.

    Returns:
        The parsed JSON content of the cache file.

    Raises:
        KeyError: If report_desc not found in report_metadata.
        FileNotFoundError: If cached data does not exist.

    Example:
        >>> metadata = load_report_metadata_batch(configs)
        >>> data = load_data(metadata, "sales_open_deals")
        >>> print(f"Loaded {len(data)} records from cache")
    """
    meta = report_metadata[report_desc]
    app_name, table_name, report_name = meta["app_name"], meta["table_name"], meta["report_name"]

    manager = get_cache_manager(cache_root=cache_root)
    cached_file = manager.get_data_path(app_name, table_name, report_name)

    if cached_file.exists():
        return json.loads(manager.read_file(cached_file))

    raise FileNotFoundError(f"Cached data not found for '{report_desc}'. Expected: {cached_file}")
216
+
217
+
218
def load_data_batch(
    report_metadata: dict,
    report_descriptions: list[str],
    cache_root=None,
) -> dict[str, list[dict]]:
    """Load cached data for several reports, one after another.

    Convenience wrapper that calls ``load_data`` for each description in
    order and gathers the results into a dict.

    Args:
        report_metadata: Full metadata dict (from load_report_metadata_batch).
        report_descriptions: List of report descriptions to load.
        cache_root: Optional cache root path, forwarded to load_data.

    Returns:
        Dict mapping report_description -> list of record dicts. Empty dict
        if report_descriptions is empty or None.

    Raises:
        KeyError: If any report_desc not found in report_metadata.
        FileNotFoundError: If any cached data does not exist.

    Example:
        >>> metadata = load_report_metadata_batch(configs)
        >>> descriptions = ["sales_open_deals", "sales_contacts"]
        >>> all_data = load_data_batch(metadata, descriptions)
        >>> print(f"Loaded {len(all_data)} reports from cache")
    """
    if not report_descriptions:
        return {}

    return {
        desc: load_data(report_metadata, desc, cache_root=cache_root)
        for desc in report_descriptions
    }