ingestr 0.4.0__tar.gz → 0.5.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (77)
  1. {ingestr-0.4.0 → ingestr-0.5.1}/Dockerfile +1 -1
  2. {ingestr-0.4.0 → ingestr-0.5.1}/PKG-INFO +7 -1
  3. {ingestr-0.4.0 → ingestr-0.5.1}/README.md +5 -0
  4. {ingestr-0.4.0 → ingestr-0.5.1}/docs/.vitepress/config.mjs +4 -1
  5. ingestr-0.5.1/docs/supported-sources/gsheets.md +41 -0
  6. {ingestr-0.4.0 → ingestr-0.5.1}/docs/supported-sources/overview.md +5 -0
  7. {ingestr-0.4.0 → ingestr-0.5.1}/ingestr/main.py +14 -1
  8. {ingestr-0.4.0 → ingestr-0.5.1}/ingestr/src/factory.py +9 -1
  9. ingestr-0.5.1/ingestr/src/google_sheets/README.md +95 -0
  10. ingestr-0.5.1/ingestr/src/google_sheets/__init__.py +152 -0
  11. ingestr-0.5.1/ingestr/src/google_sheets/helpers/__init__.py +1 -0
  12. ingestr-0.5.1/ingestr/src/google_sheets/helpers/api_calls.py +146 -0
  13. ingestr-0.5.1/ingestr/src/google_sheets/helpers/data_processing.py +302 -0
  14. {ingestr-0.4.0 → ingestr-0.5.1}/ingestr/src/sources.py +46 -0
  15. {ingestr-0.4.0 → ingestr-0.5.1}/ingestr/src/sql_database/override.py +1 -0
  16. ingestr-0.5.1/ingestr/src/version.py +1 -0
  17. {ingestr-0.4.0 → ingestr-0.5.1}/ingestr/testdata/test_append.db +0 -0
  18. {ingestr-0.4.0 → ingestr-0.5.1}/ingestr/testdata/test_create_replace.db +0 -0
  19. ingestr-0.4.0/ingestr/testdata/test_merge_with_primary_key.db → ingestr-0.5.1/ingestr/testdata/test_delete_insert_with_timerange.db +0 -0
  20. {ingestr-0.4.0 → ingestr-0.5.1}/ingestr/testdata/test_delete_insert_without_primary_key.db +0 -0
  21. ingestr-0.4.0/ingestr/testdata/test_delete_insert_with_timerange.db → ingestr-0.5.1/ingestr/testdata/test_merge_with_primary_key.db +0 -0
  22. {ingestr-0.4.0 → ingestr-0.5.1}/pyproject.toml +2 -0
  23. {ingestr-0.4.0 → ingestr-0.5.1}/requirements.txt +1 -0
  24. ingestr-0.4.0/ingestr/src/version.py +0 -1
  25. {ingestr-0.4.0 → ingestr-0.5.1}/.dockerignore +0 -0
  26. {ingestr-0.4.0 → ingestr-0.5.1}/.github/workflows/deploy-docs.yml +0 -0
  27. {ingestr-0.4.0 → ingestr-0.5.1}/.github/workflows/docker.yml +0 -0
  28. {ingestr-0.4.0 → ingestr-0.5.1}/.gitignore +0 -0
  29. {ingestr-0.4.0 → ingestr-0.5.1}/LICENSE.md +0 -0
  30. {ingestr-0.4.0 → ingestr-0.5.1}/Makefile +0 -0
  31. {ingestr-0.4.0 → ingestr-0.5.1}/docs/.vitepress/theme/custom.css +0 -0
  32. {ingestr-0.4.0 → ingestr-0.5.1}/docs/.vitepress/theme/index.js +0 -0
  33. {ingestr-0.4.0 → ingestr-0.5.1}/docs/commands/example-uris.md +0 -0
  34. {ingestr-0.4.0 → ingestr-0.5.1}/docs/commands/ingest.md +0 -0
  35. {ingestr-0.4.0 → ingestr-0.5.1}/docs/getting-started/core-concepts.md +0 -0
  36. {ingestr-0.4.0 → ingestr-0.5.1}/docs/getting-started/incremental-loading.md +0 -0
  37. {ingestr-0.4.0 → ingestr-0.5.1}/docs/getting-started/quickstart.md +0 -0
  38. {ingestr-0.4.0 → ingestr-0.5.1}/docs/getting-started/telemetry.md +0 -0
  39. {ingestr-0.4.0 → ingestr-0.5.1}/docs/index.md +0 -0
  40. {ingestr-0.4.0 → ingestr-0.5.1}/docs/supported-sources/bigquery.md +0 -0
  41. {ingestr-0.4.0 → ingestr-0.5.1}/docs/supported-sources/csv.md +0 -0
  42. {ingestr-0.4.0 → ingestr-0.5.1}/docs/supported-sources/databricks.md +0 -0
  43. {ingestr-0.4.0 → ingestr-0.5.1}/docs/supported-sources/duckdb.md +0 -0
  44. {ingestr-0.4.0 → ingestr-0.5.1}/docs/supported-sources/mongodb.md +0 -0
  45. {ingestr-0.4.0 → ingestr-0.5.1}/docs/supported-sources/mssql.md +0 -0
  46. {ingestr-0.4.0 → ingestr-0.5.1}/docs/supported-sources/mysql.md +0 -0
  47. {ingestr-0.4.0 → ingestr-0.5.1}/docs/supported-sources/notion.md +0 -0
  48. {ingestr-0.4.0 → ingestr-0.5.1}/docs/supported-sources/oracle.md +0 -0
  49. {ingestr-0.4.0 → ingestr-0.5.1}/docs/supported-sources/postgres.md +0 -0
  50. {ingestr-0.4.0 → ingestr-0.5.1}/docs/supported-sources/redshift.md +0 -0
  51. {ingestr-0.4.0 → ingestr-0.5.1}/docs/supported-sources/sap-hana.md +0 -0
  52. {ingestr-0.4.0 → ingestr-0.5.1}/docs/supported-sources/snowflake.md +0 -0
  53. {ingestr-0.4.0 → ingestr-0.5.1}/docs/supported-sources/sqlite.md +0 -0
  54. {ingestr-0.4.0 → ingestr-0.5.1}/ingestr/main_test.py +0 -0
  55. {ingestr-0.4.0 → ingestr-0.5.1}/ingestr/src/destinations.py +0 -0
  56. {ingestr-0.4.0 → ingestr-0.5.1}/ingestr/src/destinations_test.py +0 -0
  57. {ingestr-0.4.0 → ingestr-0.5.1}/ingestr/src/factory_test.py +0 -0
  58. {ingestr-0.4.0 → ingestr-0.5.1}/ingestr/src/mongodb/__init__.py +0 -0
  59. {ingestr-0.4.0 → ingestr-0.5.1}/ingestr/src/mongodb/helpers.py +0 -0
  60. {ingestr-0.4.0 → ingestr-0.5.1}/ingestr/src/notion/__init__.py +0 -0
  61. {ingestr-0.4.0 → ingestr-0.5.1}/ingestr/src/notion/helpers/__init__.py +0 -0
  62. {ingestr-0.4.0 → ingestr-0.5.1}/ingestr/src/notion/helpers/client.py +0 -0
  63. {ingestr-0.4.0 → ingestr-0.5.1}/ingestr/src/notion/helpers/database.py +0 -0
  64. {ingestr-0.4.0 → ingestr-0.5.1}/ingestr/src/notion/settings.py +0 -0
  65. {ingestr-0.4.0 → ingestr-0.5.1}/ingestr/src/sources_test.py +0 -0
  66. {ingestr-0.4.0 → ingestr-0.5.1}/ingestr/src/sql_database/__init__.py +0 -0
  67. {ingestr-0.4.0 → ingestr-0.5.1}/ingestr/src/sql_database/helpers.py +0 -0
  68. {ingestr-0.4.0 → ingestr-0.5.1}/ingestr/src/sql_database/schema_types.py +0 -0
  69. {ingestr-0.4.0 → ingestr-0.5.1}/ingestr/src/telemetry/event.py +0 -0
  70. {ingestr-0.4.0 → ingestr-0.5.1}/ingestr/src/testdata/fakebqcredentials.json +0 -0
  71. {ingestr-0.4.0 → ingestr-0.5.1}/ingestr/testdata/.gitignore +0 -0
  72. {ingestr-0.4.0 → ingestr-0.5.1}/package-lock.json +0 -0
  73. {ingestr-0.4.0 → ingestr-0.5.1}/package.json +0 -0
  74. {ingestr-0.4.0 → ingestr-0.5.1}/requirements-dev.txt +0 -0
  75. {ingestr-0.4.0 → ingestr-0.5.1}/resources/demo.gif +0 -0
  76. {ingestr-0.4.0 → ingestr-0.5.1}/resources/demo.tape +0 -0
  77. {ingestr-0.4.0 → ingestr-0.5.1}/resources/ingestr.svg +0 -0
@@ -10,7 +10,7 @@ ENV VIRTUAL_ENV=/usr/local
  ADD --chmod=755 https://astral.sh/uv/install.sh /install.sh
  RUN /install.sh && rm /install.sh

- RUN /root/.cargo/bin/uv pip install --no-cache -r requirements.txt
+ RUN /root/.cargo/bin/uv pip install --system --no-cache -r requirements.txt

  COPY . /app

@@ -1,6 +1,6 @@
  Metadata-Version: 2.3
  Name: ingestr
- Version: 0.4.0
+ Version: 0.5.1
  Summary: ingestr is a command-line application that ingests data from various sources and stores them in any database.
  Project-URL: Homepage, https://github.com/bruin-data/ingestr
  Project-URL: Issues, https://github.com/bruin-data/ingestr/issues
@@ -19,6 +19,7 @@ Requires-Dist: databricks-sql-connector==2.9.3
  Requires-Dist: dlt==0.4.8
  Requires-Dist: duckdb-engine==0.11.5
  Requires-Dist: duckdb==0.10.2
+ Requires-Dist: google-api-python-client==2.130.0
  Requires-Dist: google-cloud-bigquery-storage==2.24.0
  Requires-Dist: pendulum==3.0.0
  Requires-Dist: psycopg2-binary==2.9.9
@@ -169,6 +170,11 @@ Join our Slack community [here](https://join.slack.com/t/bruindatacommunity/shar
  <tr>
  <td colspan="3" style='text-align:center;'><strong>Platforms</strong></td>
  </tr>
+ <tr>
+ <td>Google Sheets</td>
+ <td>✅</td>
+ <td>❌</td>
+ </tr>
  <tr>
  <td>Notion</td>
  <td>✅</td>
@@ -128,6 +128,11 @@ Join our Slack community [here](https://join.slack.com/t/bruindatacommunity/shar
  <tr>
  <td colspan="3" style='text-align:center;'><strong>Platforms</strong></td>
  </tr>
+ <tr>
+ <td>Google Sheets</td>
+ <td>✅</td>
+ <td>❌</td>
+ </tr>
  <tr>
  <td>Notion</td>
  <td>✅</td>
@@ -67,7 +67,10 @@ export default defineConfig({
  {
  text: "Platforms",
  collapsed: false,
- items: [{ text: "Notion", link: "/supported-sources/notion.md" }],
+ items: [
+ { text: "Google Sheets", link: "/supported-sources/gsheets.md" },
+ { text: "Notion", link: "/supported-sources/notion.md" },
+ ],
  },
  ],
  },
@@ -0,0 +1,41 @@
+ # Google Sheets
+ [Google Sheets](https://www.google.com/sheets/about/) is a web-based spreadsheet program that is part of Google's free, web-based Google Docs Editors suite.
+
+ ingestr supports Google Sheets as a source.
+
+ ## URI Format
+ The URI format for Google Sheets is as follows:
+
+ ```
+ gsheets://?credentials_path=/path/to/service/account.json
+ ```
+
+ Alternatively, you can use base64 encoded credentials:
+ ```
+ gsheets://?credentials_base64=<base64_encoded_credentials>
+ ```
+
+ URI parameters:
+ - `credentials_path`: the path to the service account JSON file; alternatively, use `credentials_base64` with the base64-encoded content of that file
+
+ The URI is used to connect to the Google Sheets API for extracting data.
+
+ ## Setting up a Google Sheets Integration
+
+ Google Sheets requires a few steps to set up an integration; please follow the guide dltHub [has built here](https://dlthub.com/docs/dlt-ecosystem/verified-sources/google_sheets#setup-guide).
+
+ Once you complete the guide, you should have a service account JSON file and the spreadsheet ID to connect to. Let's say:
+ - you store your JSON file at the path `/path/to/file.json`
+ - the spreadsheet you'd like to connect to is `abcdxyz`
+ - the sheet inside the spreadsheet is `Sheet1`
+
+ Based on these assumptions, here's a sample command that will copy the data from the Google Sheets spreadsheet into a duckdb database:
+
+ ```sh
+ ingestr ingest --source-uri 'gsheets://?credentials_path=/path/to/file.json' --source-table 'abcdxyz.Sheet1' --dest-uri duckdb:///gsheets.duckdb --dest-table 'gsheets.output'
+ ```
+
+ The result of this command will be a table in the `gsheets.duckdb` database.
+
+ > [!CAUTION]
+ > Google Sheets does not support incremental loading, which means every time you run the command, it will copy the entire spreadsheet from Google Sheets to the destination. This can be slow for large spreadsheets.
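For the `credentials_base64` variant shown above, the encoded value is simply the base64 form of the service account JSON; the source decodes it and parses it as JSON. A minimal sketch for producing the URI, assuming the same hypothetical `/path/to/file.json` as in the example:

```python
import base64
from urllib.parse import quote

# read the service account JSON and encode it for the credentials_base64 parameter;
# quote() protects base64 characters such as '+' and '/' inside the query string
with open("/path/to/file.json", "rb") as f:  # hypothetical path from the example above
    encoded = base64.b64encode(f.read()).decode("utf-8")

print(f"gsheets://?credentials_base64={quote(encoded)}")
```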
@@ -79,6 +79,11 @@ ingestr supports the following sources and destinations:
  <tr>
  <td colspan="3" style='text-align:center;'><strong>Platforms</strong></td>
  </tr>
+ <tr>
+ <td>Google Sheets</td>
+ <td>✅</td>
+ <td>❌</td>
+ </tr>
  <tr>
  <td>Notion</td>
  <td>✅</td>
@@ -6,6 +6,7 @@ from typing import Optional
  import dlt
  import humanize
  import typer
+ from dlt.common.pipeline import LoadInfo
  from dlt.common.runtime.collector import Collector, LogCollector
  from rich.console import Console
  from rich.status import Status
@@ -310,7 +311,7 @@ def ingest(
  ):
  loader_file_format = None

- run_info = pipeline.run(
+ run_info: LoadInfo = pipeline.run(
  dlt_source,
  **destination.dlt_run_params(
  uri=dest_uri,
@@ -323,6 +324,18 @@
  else None, # type: ignore
  )

+ for load_package in run_info.load_packages:
+ failed_jobs = load_package.jobs["failed_jobs"]
+ if len(failed_jobs) > 0:
+ print()
+ print("[bold red]Failed jobs:[/bold red]")
+ print()
+ for job in failed_jobs:
+ print(f"[bold red] {job.job_file_info.job_id()}[/bold red]")
+ print(f" [bold yellow]Error:[/bold yellow] {job.failed_message}")
+
+ raise typer.Exit()
+
  destination.post_load()

  elapsedHuman = ""
@@ -14,7 +14,13 @@ from ingestr.src.destinations import (
  SnowflakeDestination,
  SynapseDestination,
  )
- from ingestr.src.sources import LocalCsvSource, MongoDbSource, NotionSource, SqlSource
+ from ingestr.src.sources import (
+ GoogleSheetsSource,
+ LocalCsvSource,
+ MongoDbSource,
+ NotionSource,
+ SqlSource,
+ )

  SQL_SOURCE_SCHEMES = [
  "bigquery",
@@ -83,6 +89,8 @@ class SourceDestinationFactory:
  return MongoDbSource()
  elif self.source_scheme == "notion":
  return NotionSource()
+ elif self.source_scheme == "gsheets":
+ return GoogleSheetsSource()
  else:
  raise ValueError(f"Unsupported source scheme: {self.source_scheme}")

@@ -0,0 +1,95 @@
+ # Google Sheets
+
+ ## Prepare your data
+
+ We recommend using [Named Ranges](link to gsheets) to indicate which data should be extracted from a particular spreadsheet, and this is how this source
+ will work by default - when called without setting any other options. All the named ranges will be converted into tables named after them and stored in the
+ destination.
+ * You can let the spreadsheet users add and remove tables by just adding/removing the ranges; you do not need to configure the pipeline again.
+ * You can indicate exactly the fragments of interest, and only this data will be retrieved, so it is the fastest.
+ * You can name database tables by changing the range names.
+
+ If you are not happy with the workflow above, you can:
+ * Disable it by setting the `get_named_ranges` option to False
+ * Enable retrieving all sheets/tabs with the `get_sheets` option set to True
+ * Pass a list of ranges as supported by Google Sheets in `range_names`
+
+ Note that hidden columns will be extracted.
+
+ > 💡 You can load data from many spreadsheets and also rename the tables to which data is loaded. This is a standard part of `dlt`; see the `load_with_table_rename_and_multiple_spreadsheets` demo in `google_sheets_pipeline.py`
+
+ ### Make sure your data has headers and is a proper table
+ **The first row of any extracted range should contain headers**. Please make sure:
+ 1. The header names are strings and are unique.
+ 2. All the columns that you intend to extract have a header.
+ 3. The data starts exactly at the origin of the range - otherwise the source will remove the padding, but it is a waste of resources!
+
+ When the source detects any problems with headers or table layout **it will issue a WARNING in the log**, so it makes sense to run your pipeline script manually/locally and fix all the problems.
+ 1. Columns without headers will be removed and not extracted!
+ 2. Columns with headers that do not contain any data will be removed.
+ 3. If there are any problems with reading headers (i.e. a header is not a string, is empty, or is not unique): **the headers row will be extracted as data** and automatic header names will be used.
+ 4. Empty rows are ignored.
+ 5. `dlt` will normalize range names and headers into table and column names - so they may be different in the database than in Google Sheets. Prefer lowercase names without special characters!
+
+ ### Data Types
+ The `dlt` normalizer will use the first row of data to infer types and will try to coerce the following rows - creating variant columns if that is not possible. This is standard behavior.
+ **date time** and **date** types are also recognized, and this happens via additional metadata that is retrieved for the first row.
+
+ ## Passing the spreadsheet id/url and explicit range names
+ You can use either the url of your spreadsheet, which you can copy from the browser, i.e.
+ ```
+ https://docs.google.com/spreadsheets/d/1VTtCiYgxjAwcIw7UM1_BSaxC3rzIpr0HwXZwd2OlPD4/edit?usp=sharing
+ ```
+ or the spreadsheet id (which is a part of the url)
+ ```
+ 1VTtCiYgxjAwcIw7UM1_BSaxC3rzIpr0HwXZwd2OlPD4
+ ```
+ Typically, you pass it directly to the `google_spreadsheet` function.
+
+ **passing ranges**
+
+ You can pass explicit ranges to `google_spreadsheet`:
+ 1. sheet names
+ 2. named ranges
+ 3. any range in Google Sheets format, i.e. **sheet 1!A1:B7**
+
+
+ ## The `spreadsheet_info` table
+ This table is repopulated after every load and keeps the information on loaded ranges:
+ * id and title of the spreadsheet
+ * name of the range as passed to the source
+ * string representation of the loaded range
+ * the range above in parsed representation
+
+ ## Running on Airflow (and some under the hood information)
+ Internally, the source loads all the data immediately in `google_spreadsheet`, before execution of the pipeline in `run`. No matter how many ranges you request, we make just two calls to the API to retrieve data. This works very well with typical scripts that create a dlt source with `google_spreadsheet` and then run it with `pipeline.run`.
+
+ In the case of Airflow, the source is created and executed separately. In a typical configuration where the runner is a separate machine, **this will load data twice**.
+
+ **Moreover, you should not use `scc` decomposition in our Airflow helper**. It will create an instance of the source for each requested range in order to run a task that corresponds to it! Following our [Airflow deployment guide](https://dlthub.com/docs/walkthroughs/deploy-a-pipeline/deploy-with-airflow-composer#2-modify-dag-file), this is how you should use `tasks.add_run` on `PipelineTasksGroup`:
+ ```python
+ @dag(
+ schedule_interval='@daily',
+ start_date=pendulum.datetime(2023, 2, 1),
+ catchup=False,
+ max_active_runs=1,
+ default_args=default_task_args
+ )
+ def get_named_ranges():
+ tasks = PipelineTasksGroup("get_named_ranges", use_data_folder=False, wipe_local_data=True)
+
+ # import your source from pipeline script
+ from google_sheets import google_spreadsheet
+
+ pipeline = dlt.pipeline(
+ pipeline_name="get_named_ranges",
+ dataset_name="named_ranges_data",
+ destination='bigquery',
+ )
+
+ # do not use decompose to run `google_spreadsheet` in single task
+ tasks.add_run(pipeline, google_spreadsheet("1HhWHjqouQnnCIZAFa2rL6vT91YRN8aIhts22SUUR580"), decompose="none", trigger_rule="all_done", retries=0, provide_context=True)
+ ```
+
+ ## Setup credentials
+ [We recommend using a service account for any production deployments](https://dlthub.com/docs/dlt-ecosystem/verified-sources/google_sheets#google-sheets-api-authentication)
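To make the range-passing options above concrete, a minimal sketch of using the source directly with dlt follows. It assumes credentials are already configured via dlt secrets as described in the credentials section; the import path follows the ingestr package layout in this diff, the spreadsheet URL is the example one from this README, and `my_named_range` is a made-up named range:

```python
import dlt

from ingestr.src.google_sheets import google_spreadsheet

# ranges may mix sheet names, named ranges and A1-style ranges
source = google_spreadsheet(
    spreadsheet_url_or_id="https://docs.google.com/spreadsheets/d/1VTtCiYgxjAwcIw7UM1_BSaxC3rzIpr0HwXZwd2OlPD4/edit?usp=sharing",
    range_names=["Sheet 1!A1:B7", "my_named_range"],
    get_sheets=False,
    get_named_ranges=False,
)

pipeline = dlt.pipeline(
    pipeline_name="gsheets_ranges",
    destination="duckdb",
    dataset_name="ranges_data",
)
print(pipeline.run(source))
```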
@@ -0,0 +1,152 @@
+ """Loads Google Sheets data from tabs, named and explicit ranges. Contains the main source functions."""
+
+ from typing import Iterable, Sequence, Union
+
+ import dlt
+ from dlt.common import logger
+ from dlt.sources import DltResource
+ from dlt.sources.credentials import GcpOAuthCredentials, GcpServiceAccountCredentials
+
+ from .helpers import api_calls
+ from .helpers.api_calls import api_auth
+ from .helpers.data_processing import (
+ get_data_types,
+ get_range_headers,
+ get_spreadsheet_id,
+ process_range,
+ )
+
+
+ @dlt.source
+ def google_spreadsheet(
+ spreadsheet_url_or_id: str = dlt.config.value,
+ range_names: Sequence[str] = dlt.config.value,
+ credentials: Union[
+ GcpOAuthCredentials, GcpServiceAccountCredentials
+ ] = dlt.secrets.value,
+ get_sheets: bool = False,
+ get_named_ranges: bool = True,
+ max_api_retries: int = 5,
+ ) -> Iterable[DltResource]:
+ """
+ The source for the dlt pipeline. It returns the following resources:
+ - 1 dlt resource for every range in range_names.
+ - Optionally, dlt resources for all sheets inside the spreadsheet and all named ranges inside the spreadsheet.
+
+ Args:
+ spreadsheet_url_or_id (str): The ID or URL of the spreadsheet.
+ range_names (Sequence[str]): A list of ranges in the spreadsheet in the format used by Google Sheets. Accepts Named Ranges and Sheets (tabs) names.
+ These are the ranges to be converted into tables.
+ credentials (Union[GcpServiceAccountCredentials, GcpOAuthCredentials]): GCP credentials to the account
+ with Google Sheets API access, defined in dlt.secrets.
+ get_sheets (bool, optional): If True, load all the sheets inside the spreadsheet into the database.
+ Defaults to False.
+ get_named_ranges (bool, optional): If True, load all the named ranges inside the spreadsheet into the database.
+ Defaults to True.
+ max_api_retries (int, optional): Max number of retries to google sheets API. Actual behavior is internal to google client.
+
+ Yields:
+ Iterable[DltResource]: List of dlt resources.
+ """
+ # authenticate to the service using the helper function
+ service = api_auth(credentials, max_api_retries=max_api_retries)
+ # get spreadsheet id from url or id
+ spreadsheet_id = get_spreadsheet_id(spreadsheet_url_or_id)
+ all_range_names = set(range_names or [])
+ # if no explicit ranges, get sheets and named ranges from metadata
+ # get metadata with list of sheets and named ranges in the spreadsheet
+ sheet_names, named_ranges, spreadsheet_title = api_calls.get_known_range_names(
+ spreadsheet_id=spreadsheet_id, service=service
+ )
+ if not range_names:
+ if get_sheets:
+ all_range_names.update(sheet_names)
+ if get_named_ranges:
+ all_range_names.update(named_ranges)
+
+ # first we get all data for all the ranges (explicit or named)
+ all_range_data = api_calls.get_data_for_ranges(
+ service=service,
+ spreadsheet_id=spreadsheet_id,
+ range_names=list(all_range_names),
+ )
+ assert len(all_range_names) == len(
+ all_range_data
+ ), "Google Sheets API must return values for all requested ranges"
+
+ # get metadata for the first two rows of each range
+ # first should contain headers
+ # second row contains data which we'll use to sample data types.
+ # google sheets returns datetime and date types as a lotus notes serial number, which is just a float, so we cannot infer the correct types just from the data
+
+ # warn and remove empty ranges
+ range_data = []
+ metadata_table = []
+ for name, parsed_range, meta_range, values in all_range_data:
+ # # pass all ranges to spreadsheet info - including empty
+ # metadata_table.append(
+ # {
+ # "spreadsheet_id": spreadsheet_id,
+ # "title": spreadsheet_title,
+ # "range_name": name,
+ # "range": str(parsed_range),
+ # "range_parsed": parsed_range._asdict(),
+ # "skipped": True,
+ # }
+ # )
+ if values is None or len(values) == 0:
+ logger.warning(f"Range {name} does not contain any data. Skipping.")
+ continue
+ if len(values) == 1:
+ logger.warning(f"Range {name} contains only 1 row of data. Skipping.")
+ continue
+ if len(values[0]) == 0:
+ logger.warning(
+ f"First row of range {name} does not contain data. Skipping."
+ )
+ continue
+ # metadata_table[-1]["skipped"] = False
+ range_data.append((name, parsed_range, meta_range, values))
+
+ meta_values = api_calls.get_meta_for_ranges(
+ service, spreadsheet_id, [str(data[2]) for data in range_data]
+ )
+ for name, parsed_range, _, values in range_data:
+ logger.info(f"Processing range {parsed_range} with name {name}")
+ # here is a tricky part due to how Google Sheets API returns the metadata. We are not able to directly pair the input range names with returned metadata objects
+ # instead metadata objects are grouped by sheet names, still each group order preserves the order of input ranges
+ # so for each range we get a sheet name, we look for the metadata group for that sheet and then we consume first object on that list with pop
+ metadata = next(
+ sheet
+ for sheet in meta_values["sheets"]
+ if sheet["properties"]["title"] == parsed_range.sheet_name
+ )["data"].pop(0)
+
+ headers_metadata = metadata["rowData"][0]["values"]
+ headers = get_range_headers(headers_metadata, name)
+ if headers is None:
+ # generate automatic headers and treat the first row as data
+ headers = [f"col_{idx+1}" for idx in range(len(headers_metadata))]
+ data_row_metadata = headers_metadata
+ rows_data = values[0:]
+ logger.warning(
+ f"Using automatic headers. WARNING: first row of the range {name} will be used as data!"
+ )
+ else:
+ # first row contains headers and is skipped
+ data_row_metadata = metadata["rowData"][1]["values"]
+ rows_data = values[1:]
+
+ data_types = get_data_types(data_row_metadata)
+
+ yield dlt.resource(
+ process_range(rows_data, headers=headers, data_types=data_types),
+ name=name,
+ write_disposition="replace",
+ )
+ yield dlt.resource(
+ metadata_table,
+ write_disposition="merge",
+ name="spreadsheet_info",
+ merge_key="spreadsheet_id",
+ )
@@ -0,0 +1 @@
+ """Google Sheets source helpers"""
@@ -0,0 +1,146 @@
+ """Contains helper functions to extract data from spreadsheet API"""
+
+ from typing import Any, List, Tuple
+
+ from dlt.common.exceptions import MissingDependencyException
+ from dlt.common.typing import DictStrAny
+ from dlt.sources.credentials import GcpCredentials, GcpOAuthCredentials
+ from dlt.sources.helpers.requests.retry import DEFAULT_RETRY_STATUS
+ from tenacity import retry, retry_if_exception, stop_after_attempt, wait_exponential
+
+ from .data_processing import ParsedRange, trim_range_top_left
+
+ try:
+ from apiclient.discovery import Resource, build
+ except ImportError:
+ raise MissingDependencyException("Google API Client", ["google-api-python-client"])
+
+
+ def is_retry_status_code(exception: BaseException) -> bool:
+ """Retry condition on HttpError"""
+ from googleapiclient.errors import HttpError # type: ignore
+
+ # print(f"RETRY ON {str(HttpError)} = {isinstance(exception, HttpError) and exception.resp.status in DEFAULT_RETRY_STATUS}")
+ # if isinstance(exception, HttpError):
+ # print(exception.resp.status)
+ # print(DEFAULT_RETRY_STATUS)
+ return (
+ isinstance(exception, HttpError)
+ and exception.resp.status in DEFAULT_RETRY_STATUS
+ )
+
+
+ retry_deco = retry(
+ # Retry if it's a rate limit error (HTTP 429)
+ retry=retry_if_exception(is_retry_status_code),
+ # Use exponential backoff for the waiting time between retries, starting with 5 seconds
+ wait=wait_exponential(multiplier=1.5, min=5, max=120),
+ # Stop retrying after 10 attempts
+ stop=stop_after_attempt(10),
+ # Print out the retrying details
+ reraise=True,
+ )
+
+
+ def api_auth(credentials: GcpCredentials, max_api_retries: int) -> Resource:
+ """
+ Uses GCP credentials to authenticate with Google Sheets API.
+
+ Args:
+ credentials (GcpCredentials): Credentials needed to log in to GCP.
+ max_api_retries (int): Max number of retries to google sheets API. Actual behavior is internal to google client.
+
+ Returns:
+ Resource: Object needed to make API calls to Google Sheets API.
+ """
+ if isinstance(credentials, GcpOAuthCredentials):
+ credentials.auth("https://www.googleapis.com/auth/spreadsheets.readonly")
+ # Build the service object for Google sheets api.
+ service = build(
+ "sheets",
+ "v4",
+ credentials=credentials.to_native_credentials(),
+ num_retries=max_api_retries,
+ )
+ return service
+
+
+ @retry_deco
+ def get_meta_for_ranges(
+ service: Resource, spreadsheet_id: str, range_names: List[str]
+ ) -> Any:
+ """Retrieves `spreadsheet_id` cell metadata for `range_names`"""
+ return (
+ service.spreadsheets()
+ .get(
+ spreadsheetId=spreadsheet_id,
+ ranges=range_names,
+ includeGridData=True,
+ )
+ .execute()
+ )
+
+
+ @retry_deco
+ def get_known_range_names(
+ spreadsheet_id: str, service: Resource
+ ) -> Tuple[List[str], List[str], str]:
+ """
+ Retrieves spreadsheet metadata and extracts a list of sheet names and named ranges
+
+ Args:
+ spreadsheet_id (str): The ID of the spreadsheet.
+ service (Resource): Resource object used to make API calls to Google Sheets API.
+
+ Returns:
+ Tuple[List[str], List[str], str]: sheet names, named ranges, spreadsheet title
+ """
+ metadata = service.spreadsheets().get(spreadsheetId=spreadsheet_id).execute()
+ sheet_names: List[str] = [s["properties"]["title"] for s in metadata["sheets"]]
+ named_ranges: List[str] = [r["name"] for r in metadata.get("namedRanges", {})]
+ title: str = metadata["properties"]["title"]
+ return sheet_names, named_ranges, title
+
+
+ @retry_deco
+ def get_data_for_ranges(
+ service: Resource, spreadsheet_id: str, range_names: List[str]
+ ) -> List[Tuple[str, ParsedRange, ParsedRange, List[List[Any]]]]:
+ """
+ Calls Google Sheets API to get data in a batch. This is the most efficient way to get data for multiple ranges inside a spreadsheet.
+
+ Args:
+ service (Resource): Object to make API calls to Google Sheets.
+ spreadsheet_id (str): The ID of the spreadsheet.
+ range_names (List[str]): List of range names.
+
+ Returns:
+ List[Tuple[str, ParsedRange, ParsedRange, List[List[Any]]]]: A list of ranges with data in the same order as `range_names`
+ """
+ range_batch_resp = (
+ service.spreadsheets()
+ .values()
+ .batchGet(
+ spreadsheetId=spreadsheet_id,
+ ranges=range_names,
+ # UNFORMATTED_VALUE returns typed values
+ valueRenderOption="UNFORMATTED_VALUE",
+ # will return formatted dates as a serial number
+ dateTimeRenderOption="SERIAL_NUMBER",
+ )
+ .execute()
+ )
+ # if there are no ranges to be loaded, there's no "valueRanges"
+ range_batch: List[DictStrAny] = range_batch_resp.get("valueRanges", [])
+ # trim the empty top rows and columns from the left
+ rv = []
+ for name, range_ in zip(range_names, range_batch):
+ parsed_range = ParsedRange.parse_range(range_["range"])
+ values: List[List[Any]] = range_.get("values", None)
+ if values:
+ parsed_range, values = trim_range_top_left(parsed_range, values)
+ # create a new range to get first two rows
+ meta_range = parsed_range._replace(end_row=parsed_range.start_row + 1)
+ # print(f"{name}:{parsed_range}:{meta_range}")
+ rv.append((name, parsed_range, meta_range, values))
+ return rv
@@ -0,0 +1,302 @@
+ """This is a helper module that contains functions which validate and process data"""
+
+ import re
+ from typing import Any, Iterator, List, NamedTuple, Tuple, Union
+
+ import dlt
+ from dlt.common import logger, pendulum
+ from dlt.common.data_types import TDataType
+ from dlt.common.typing import DictStrAny
+
+ # this string comes before the id
+ URL_ID_IDENTIFIER = "d"
+ # time info
+ SECONDS_PER_DAY = 86400
+ # TIMEZONE info
+ DLT_TIMEZONE = "UTC"
+ # number of seconds from UNIX timestamp origin (1st Jan 1970) to serial number origin (30th Dec 1899)
+ TIMESTAMP_CONST = -2209161600.0
+ # compiled regex to extract ranges
+ RE_PARSE_RANGE = re.compile(
+ r"^(?:(?P<sheet>[\'\w\s]+)!)?(?P<start_col>[A-Z]+)(?P<start_row>\d+):(?P<end_col>[A-Z]+)(?P<end_row>\d+)$"
+ )
+
+
+ class ParsedRange(NamedTuple):
+ sheet_name: str
+ start_col: str
+ start_row: int
+ end_col: str
+ end_row: int
+
+ @classmethod
+ def parse_range(cls, s: str) -> "ParsedRange":
+ match = RE_PARSE_RANGE.match(s)
+ if match:
+ parsed_dict = match.groupdict()
+ return ParsedRange(
+ parsed_dict["sheet"].strip("'"),
+ parsed_dict["start_col"],
+ int(parsed_dict["start_row"]),
+ parsed_dict["end_col"],
+ int(parsed_dict["end_row"]),
+ )
+ else:
+ raise ValueError(s)
+
+ def __str__(self) -> str:
+ return f"{self.sheet_name}!{self.start_col}{self.start_row}:{self.end_col}{self.end_row}"
+
+ @staticmethod
+ def shift_column(col: str, shift: int) -> str:
+ """
+ Shift a Google Sheets column string by a given number of positions.
+
+ Parameters:
+ col (str): The original column string.
+ shift (int): The number of positions to shift the column.
+
+ Returns:
+ str: The new column string after shifting.
+ """
+ # Convert column string to column index (1-indexed)
+ col_num = 0
+ for i, char in enumerate(reversed(col)):
+ col_num += (ord(char.upper()) - 65 + 1) * (26**i)
+
+ # Shift the column index
+ col_num += shift
+
+ # Convert back to column string
+ col_str = ""
+ while col_num > 0:
+ col_num, remainder = divmod(col_num - 1, 26)
+ col_str = chr(65 + remainder) + col_str
+
+ return col_str
+
+
+ def get_spreadsheet_id(url_or_id: str) -> str:
+ """
+ Receives an ID or URL to a Google Spreadsheet and returns the spreadsheet ID as a string.
+
+ Args:
+ url_or_id (str): The ID or URL of the spreadsheet.
+
+ Returns:
+ str: The spreadsheet ID as a string.
+ """
+
+ # check if this is an url: http or https in it
+ if re.match(r"http://|https://", url_or_id):
+ # process url
+ spreadsheet_id = extract_spreadsheet_id_from_url(url_or_id)
+ return spreadsheet_id
+ else:
+ # just return id
+ return url_or_id
+
+
+ def extract_spreadsheet_id_from_url(url: str) -> str:
+ """
+ Takes a URL to a Google spreadsheet and computes the spreadsheet ID from it according to the spreadsheet URL formula: https://docs.google.com/spreadsheets/d/<spreadsheet_id>/edit.
+ If the URL is not formatted correctly, a ValueError will be raised.
+
+ Args:
+ url (str): The URL to the spreadsheet.
+
+ Returns:
+ str: The spreadsheet ID as a string.
+
+ Raises:
+ ValueError: If the URL is not properly formatted.
+ """
+
+ # split on the '/'
+ parts = url.split("/")
+ # loop through parts
+ for i in range(len(parts)):
+ if parts[i] == URL_ID_IDENTIFIER and i + 1 < len(parts):
+ # if the id part is left empty then the url is not formatted correctly
+ if parts[i + 1] == "":
+ raise ValueError(f"Spreadsheet ID is an empty string in url: {url}")
+ else:
+ return parts[i + 1]
+ raise ValueError(f"Invalid URL. Cannot find spreadsheet ID in url: {url}")
+
+
+ def get_range_headers(headers_metadata: List[DictStrAny], range_name: str) -> List[str]:
+ """
+ Retrieves the headers for columns from the metadata of a range.
+
+ Args:
+ headers_metadata (List[DictStrAny]): Metadata for the first 2 rows of a range.
+ range_name (str): The name of the range as appears in the metadata.
+
+ Returns:
+ List[str]: A list of headers.
+ """
+ headers = []
+ for idx, header in enumerate(headers_metadata):
+ header_val: str = None
+ if header:
+ if "stringValue" in header.get("effectiveValue", {}):
+ header_val = header["formattedValue"]
+ else:
+ header_val = header.get("formattedValue", None)
+ # if there's no formatted value then the cell is empty (no empty string as well!) in that case add auto name and move on
+ if header_val is None:
+ header_val = str(f"col_{idx + 1}")
+ else:
+ logger.warning(
+ f"In range {range_name}, header value: {header_val} at position {idx+1} is not a string!"
+ )
+ return None
+ else:
+ logger.warning(
+ f"In range {range_name}, header at position {idx+1} is missing!"
+ )
+ return None
+ headers.append(header_val)
+
+ # make sure that headers are unique, first normalize the headers
+ header_mappings = {
+ h: dlt.current.source_schema().naming.normalize_identifier(h) for h in headers
+ }
+ if len(set(header_mappings.values())) != len(headers):
+ logger.warning(
+ "Header names must be unique, otherwise you risk that data in columns with duplicate header names will be lost. Note that several destinations require "
+ + "that column names are normalized ie. must be lower or upper case and without special characters. dlt normalizes those names for you but it may "
+ + f"result in duplicate column names. Headers in range {range_name} are mapped as follows: "
+ + ", ".join([f"{k}->{v}" for k, v in header_mappings.items()])
+ + ". Please make your header names unique."
+ )
+ return None
+
+ return headers
+
+
+ def get_data_types(data_row_metadata: List[DictStrAny]) -> List[TDataType]:
+ """
+ Determines if each column in the first line of a range contains datetime objects.
+
+ Args:
+ data_row_metadata (List[DictStrAny]): Metadata of the first row of data
+
+ Returns:
+ List[TDataType]: "timestamp" or "date" indicating the date/time type for a column, otherwise None
+ """
+
+ # get data for 1st column and process them, if empty just return an empty list
+ try:
+ data_types: List[TDataType] = [None] * len(data_row_metadata)
+ for idx, val_dict in enumerate(data_row_metadata):
+ try:
+ data_type = val_dict["effectiveFormat"]["numberFormat"]["type"]
+ if data_type in ["DATE_TIME", "TIME"]:
+ data_types[idx] = "timestamp"
+ elif data_type == "DATE":
+ data_types[idx] = "date"
+ except KeyError:
+ pass
+ return data_types
+ except IndexError:
+ return []
+
+
+ def serial_date_to_datetime(
+ serial_number: Union[int, float], data_type: TDataType
+ ) -> Union[pendulum.DateTime, pendulum.Date]:
+ """
+ Converts a serial number to a datetime or date, depending on `data_type`.
+
+ Args:
+ serial_number (Union[int, float]): The Lotus Notes serial number
+
+ Returns:
+ Union[pendulum.DateTime, pendulum.Date]: The converted datetime or date object.
+ """
+ # To get the seconds passed since the start date of serial numbers we round the product of the number of seconds in a day and the serial number
+ conv_datetime: pendulum.DateTime = pendulum.from_timestamp(
+ 0, DLT_TIMEZONE
+ ) + pendulum.duration(
+ seconds=TIMESTAMP_CONST + round(SECONDS_PER_DAY * serial_number)
+ )
+ # int values are dates, float values are datetimes
+ if data_type == "date":
+ return conv_datetime.date() # type: ignore[no-any-return]
+
+ return conv_datetime
+
+
+ def process_range(
+ sheet_values: List[List[Any]], headers: List[str], data_types: List[TDataType]
+ ) -> Iterator[DictStrAny]:
+ """
+ Yields lists of values as dictionaries, converts date/time values and handles empty rows and cells. Please note:
+ 1. empty rows get ignored
+ 2. empty cells are converted to None (and then to NULL by dlt)
+ 3. data in columns without headers will be dropped
+
+ Args:
+ sheet_values (List[List[Any]]): range values without the header row
+ headers (List[str]): names of the headers
+ data_types (List[TDataType]): "timestamp" and "date" or None for each column
+
+ Yields:
+ DictStrAny: A dictionary version of the table. It generates a dictionary of the type {header: value} for every row.
+ """
+
+ for row in sheet_values:
+ # empty row; skip
+ if not row:
+ continue
+ table_dict = {}
+ # process both rows and check for differences to spot dates
+ for val, header, data_type in zip(row, headers, data_types):
+ # 3 main cases: null cell value, datetime value, every other value
+ # handle null values properly. Null cell values are returned as empty strings, this will cause dlt to create new columns and fill them with empty strings
+ if val == "":
+ fill_val = None
+ elif data_type in ["timestamp", "date"]:
+ # the datetimes are inferred from first row of data. if next rows have inconsistent data types - pass the values to dlt to deal with them
+ if not isinstance(val, (int, float)) or isinstance(val, bool):
+ fill_val = val
+ else:
+ fill_val = serial_date_to_datetime(val, data_type)
+ else:
+ fill_val = val
+ table_dict[header] = fill_val
+ yield table_dict
+
+
+ def trim_range_top_left(
+ parsed_range: ParsedRange, range_values: List[List[Any]]
+ ) -> Tuple[ParsedRange, List[List[Any]]]:
+ # skip empty rows and then empty columns
+ # skip empty rows
+ shift_x = 0
+ for row in range_values:
+ if row:
+ break
+ else:
+ shift_x += 1
+ if shift_x > 0:
+ range_values = range_values[shift_x:]
+ # skip empty columns
+ shift_y = 0
+ if len(range_values) > 0:
+ for col in range_values[0]:
+ if col == "":
+ shift_y += 1
+ else:
+ break
+ if shift_y > 0:
+ # skip all columns
+ for idx, row in enumerate(range_values):
+ range_values[idx] = row[shift_y:]
+ parsed_range = parsed_range._replace(
+ start_row=parsed_range.start_row + shift_x,
+ start_col=ParsedRange.shift_column(parsed_range.start_col, shift_y),
+ )
+ return parsed_range, range_values
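As a quick illustration of the serial-number arithmetic used by `serial_date_to_datetime` above, here is a self-contained sketch that reuses the same constants; the serial value is made up. Serial numbers count days since 1899-12-30, so 1.5 corresponds to 1899-12-31 12:00 UTC:

```python
import pendulum

SECONDS_PER_DAY = 86400
# seconds from the Unix epoch (1970-01-01) back to the Sheets serial-number origin (1899-12-30)
TIMESTAMP_CONST = -2209161600.0

serial_number = 1.5  # made-up value: 1.5 days after 1899-12-30
converted = pendulum.from_timestamp(0, "UTC") + pendulum.duration(
    seconds=TIMESTAMP_CONST + round(SECONDS_PER_DAY * serial_number)
)
print(converted)  # -> 1899-12-31 12:00 UTC
```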
@@ -1,9 +1,12 @@
+ import base64
  import csv
+ import json
  from typing import Callable
  from urllib.parse import parse_qs, urlparse

  import dlt

+ from ingestr.src.google_sheets import google_spreadsheet
  from ingestr.src.mongodb import mongodb_collection
  from ingestr.src.notion import notion_databases
  from ingestr.src.sql_database import sql_table
@@ -129,3 +132,46 @@ class NotionSource:
  database_ids=[{"id": table}],
  api_key=api_key[0],
  )
+
+
+ class GoogleSheetsSource:
+ table_builder: Callable
+
+ def __init__(self, table_builder=google_spreadsheet) -> None:
+ self.table_builder = table_builder
+
+ def dlt_source(self, uri: str, table: str, **kwargs):
+ if kwargs.get("incremental_key"):
+ raise ValueError("Incremental loads are not supported for Google Sheets")
+
+ source_fields = urlparse(uri)
+ source_params = parse_qs(source_fields.query)
+
+ cred_path = source_params.get("credentials_path")
+ credentials_base64 = source_params.get("credentials_base64")
+ if not cred_path and not credentials_base64:
+ raise ValueError(
+ "credentials_path or credentials_base64 is required in the URI to get data from Google Sheets"
+ )
+
+ credentials = {}
+ if cred_path:
+ with open(cred_path[0], "r") as f:
+ credentials = json.load(f)
+ elif credentials_base64:
+ credentials = json.loads(
+ base64.b64decode(credentials_base64[0]).decode("utf-8")
+ )
+
+ table_fields = table.split(".")
+ if len(table_fields) != 2:
+ raise ValueError(
+ "Table name must be in the format <spreadsheet_id>.<sheet_name>"
+ )
+
+ return self.table_builder(
+ credentials=credentials,
+ spreadsheet_url_or_id=table_fields[0],
+ range_names=[table_fields[1]],
+ get_named_ranges=False,
+ )
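A hedged sketch of how this class might be exercised on its own, mirroring the CLI flow: the credentials path and the `abcdxyz.Sheet1` table value are the same hypothetical placeholders used in the gsheets.md example, and the returned dlt source can be handed to a regular dlt pipeline:

```python
import dlt

from ingestr.src.sources import GoogleSheetsSource

# hypothetical URI and <spreadsheet_id>.<sheet_name> table, as in the gsheets.md example
source = GoogleSheetsSource()
dlt_source = source.dlt_source(
    uri="gsheets://?credentials_path=/path/to/file.json",
    table="abcdxyz.Sheet1",
)

pipeline = dlt.pipeline(
    pipeline_name="gsheets_example",
    destination="duckdb",
    dataset_name="gsheets",
)
pipeline.run(dlt_source)
```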
@@ -7,3 +7,4 @@ from dlt.sources.credentials import ConnectionStringCredentials
  @configspec(init=False)
  class IngestrConnectionStringCredentials(ConnectionStringCredentials):
  username: Optional[str] = None
+ database: Optional[str] = None
@@ -0,0 +1 @@
+ __version__ = "0.5.1"
@@ -65,12 +65,14 @@ exclude = [
  'venv',
  'src/sql_database/.*',
  'src/mongodb/.*',
+ 'src/google_sheets/.*',
  ]

  [[tool.mypy.overrides]]
  module = [
  "ingestr.src.sql_database.*",
  "ingestr.src.mongodb.*",
+ "ingestr.src.google_sheets.*",
  ]
  follow_imports = "skip"

@@ -4,6 +4,7 @@ dlt==0.4.8
  duckdb_engine==0.11.5
  duckdb==0.10.2
  google-cloud-bigquery-storage==2.24.0
+ google-api-python-client==2.130.0
  pendulum==3.0.0
  psycopg2-binary==2.9.9
  py-machineid==0.5.1
@@ -1 +0,0 @@
- __version__ = "0.4.0"