ingestr 0.6.5__tar.gz → 0.7.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of ingestr might be problematic.
- ingestr-0.7.0/.github/workflows/tests.yml +36 -0
- {ingestr-0.6.5 → ingestr-0.7.0}/Makefile +4 -1
- {ingestr-0.6.5 → ingestr-0.7.0}/PKG-INFO +8 -2
- {ingestr-0.6.5 → ingestr-0.7.0}/README.md +5 -0
- {ingestr-0.6.5 → ingestr-0.7.0}/docs/supported-sources/overview.md +5 -0
- {ingestr-0.6.5 → ingestr-0.7.0}/ingestr/main.py +29 -4
- {ingestr-0.6.5 → ingestr-0.7.0}/ingestr/src/factory.py +3 -0
- {ingestr-0.6.5 → ingestr-0.7.0}/ingestr/src/sources.py +11 -21
- {ingestr-0.6.5 → ingestr-0.7.0}/ingestr/src/sql_database/__init__.py +62 -28
- ingestr-0.7.0/ingestr/src/sql_database/arrow_helpers.py +139 -0
- {ingestr-0.6.5 → ingestr-0.7.0}/ingestr/src/sql_database/helpers.py +57 -33
- ingestr-0.7.0/ingestr/src/sql_database/schema_types.py +139 -0
- ingestr-0.7.0/ingestr/src/table_definition.py +15 -0
- ingestr-0.7.0/ingestr/src/version.py +1 -0
- {ingestr-0.6.5 → ingestr-0.7.0}/pyproject.toml +6 -0
- ingestr-0.7.0/requirements-dev.txt +10 -0
- {ingestr-0.6.5 → ingestr-0.7.0}/requirements.txt +2 -1
- ingestr-0.6.5/ingestr/main_test.py +0 -875
- ingestr-0.6.5/ingestr/src/destinations_test.py +0 -113
- ingestr-0.6.5/ingestr/src/factory_test.py +0 -13
- ingestr-0.6.5/ingestr/src/gorgias/helpers_test.py +0 -45
- ingestr-0.6.5/ingestr/src/sources_test.py +0 -104
- ingestr-0.6.5/ingestr/src/sql_database/schema_types.py +0 -162
- ingestr-0.6.5/ingestr/src/version.py +0 -1
- ingestr-0.6.5/requirements-dev.txt +0 -9
- {ingestr-0.6.5 → ingestr-0.7.0}/.dockerignore +0 -0
- {ingestr-0.6.5 → ingestr-0.7.0}/.github/workflows/deploy-docs.yml +0 -0
- {ingestr-0.6.5 → ingestr-0.7.0}/.github/workflows/docker.yml +0 -0
- {ingestr-0.6.5 → ingestr-0.7.0}/.gitignore +0 -0
- {ingestr-0.6.5 → ingestr-0.7.0}/.python-version +0 -0
- {ingestr-0.6.5 → ingestr-0.7.0}/Dockerfile +0 -0
- {ingestr-0.6.5 → ingestr-0.7.0}/LICENSE.md +0 -0
- {ingestr-0.6.5 → ingestr-0.7.0}/docs/.vitepress/config.mjs +0 -0
- {ingestr-0.6.5 → ingestr-0.7.0}/docs/.vitepress/theme/custom.css +0 -0
- {ingestr-0.6.5 → ingestr-0.7.0}/docs/.vitepress/theme/index.js +0 -0
- {ingestr-0.6.5 → ingestr-0.7.0}/docs/commands/example-uris.md +0 -0
- {ingestr-0.6.5 → ingestr-0.7.0}/docs/commands/ingest.md +0 -0
- {ingestr-0.6.5 → ingestr-0.7.0}/docs/getting-started/core-concepts.md +0 -0
- {ingestr-0.6.5 → ingestr-0.7.0}/docs/getting-started/incremental-loading.md +0 -0
- {ingestr-0.6.5 → ingestr-0.7.0}/docs/getting-started/quickstart.md +0 -0
- {ingestr-0.6.5 → ingestr-0.7.0}/docs/getting-started/telemetry.md +0 -0
- {ingestr-0.6.5 → ingestr-0.7.0}/docs/index.md +0 -0
- {ingestr-0.6.5 → ingestr-0.7.0}/docs/supported-sources/bigquery.md +0 -0
- {ingestr-0.6.5 → ingestr-0.7.0}/docs/supported-sources/csv.md +0 -0
- {ingestr-0.6.5 → ingestr-0.7.0}/docs/supported-sources/databricks.md +0 -0
- {ingestr-0.6.5 → ingestr-0.7.0}/docs/supported-sources/duckdb.md +0 -0
- {ingestr-0.6.5 → ingestr-0.7.0}/docs/supported-sources/gorgias.md +0 -0
- {ingestr-0.6.5 → ingestr-0.7.0}/docs/supported-sources/gsheets.md +0 -0
- {ingestr-0.6.5 → ingestr-0.7.0}/docs/supported-sources/mongodb.md +0 -0
- {ingestr-0.6.5 → ingestr-0.7.0}/docs/supported-sources/mssql.md +0 -0
- {ingestr-0.6.5 → ingestr-0.7.0}/docs/supported-sources/mysql.md +0 -0
- {ingestr-0.6.5 → ingestr-0.7.0}/docs/supported-sources/notion.md +0 -0
- {ingestr-0.6.5 → ingestr-0.7.0}/docs/supported-sources/oracle.md +0 -0
- {ingestr-0.6.5 → ingestr-0.7.0}/docs/supported-sources/postgres.md +0 -0
- {ingestr-0.6.5 → ingestr-0.7.0}/docs/supported-sources/redshift.md +0 -0
- {ingestr-0.6.5 → ingestr-0.7.0}/docs/supported-sources/sap-hana.md +0 -0
- {ingestr-0.6.5 → ingestr-0.7.0}/docs/supported-sources/shopify.md +0 -0
- {ingestr-0.6.5 → ingestr-0.7.0}/docs/supported-sources/snowflake.md +0 -0
- {ingestr-0.6.5 → ingestr-0.7.0}/docs/supported-sources/sqlite.md +0 -0
- {ingestr-0.6.5 → ingestr-0.7.0}/ingestr/src/destinations.py +0 -0
- {ingestr-0.6.5 → ingestr-0.7.0}/ingestr/src/google_sheets/README.md +0 -0
- {ingestr-0.6.5 → ingestr-0.7.0}/ingestr/src/google_sheets/__init__.py +0 -0
- {ingestr-0.6.5 → ingestr-0.7.0}/ingestr/src/google_sheets/helpers/__init__.py +0 -0
- {ingestr-0.6.5 → ingestr-0.7.0}/ingestr/src/google_sheets/helpers/api_calls.py +0 -0
- {ingestr-0.6.5 → ingestr-0.7.0}/ingestr/src/google_sheets/helpers/data_processing.py +0 -0
- {ingestr-0.6.5 → ingestr-0.7.0}/ingestr/src/gorgias/__init__.py +0 -0
- {ingestr-0.6.5 → ingestr-0.7.0}/ingestr/src/gorgias/helpers.py +0 -0
- {ingestr-0.6.5 → ingestr-0.7.0}/ingestr/src/mongodb/__init__.py +0 -0
- {ingestr-0.6.5 → ingestr-0.7.0}/ingestr/src/mongodb/helpers.py +0 -0
- {ingestr-0.6.5 → ingestr-0.7.0}/ingestr/src/notion/__init__.py +0 -0
- {ingestr-0.6.5 → ingestr-0.7.0}/ingestr/src/notion/helpers/__init__.py +0 -0
- {ingestr-0.6.5 → ingestr-0.7.0}/ingestr/src/notion/helpers/client.py +0 -0
- {ingestr-0.6.5 → ingestr-0.7.0}/ingestr/src/notion/helpers/database.py +0 -0
- {ingestr-0.6.5 → ingestr-0.7.0}/ingestr/src/notion/settings.py +0 -0
- {ingestr-0.6.5 → ingestr-0.7.0}/ingestr/src/shopify/__init__.py +0 -0
- {ingestr-0.6.5 → ingestr-0.7.0}/ingestr/src/shopify/exceptions.py +0 -0
- {ingestr-0.6.5 → ingestr-0.7.0}/ingestr/src/shopify/helpers.py +0 -0
- {ingestr-0.6.5 → ingestr-0.7.0}/ingestr/src/shopify/settings.py +0 -0
- {ingestr-0.6.5 → ingestr-0.7.0}/ingestr/src/sql_database/override.py +0 -0
- {ingestr-0.6.5 → ingestr-0.7.0}/ingestr/src/telemetry/event.py +0 -0
- {ingestr-0.6.5 → ingestr-0.7.0}/ingestr/src/testdata/fakebqcredentials.json +0 -0
- {ingestr-0.6.5 → ingestr-0.7.0}/ingestr/testdata/.gitignore +0 -0
- {ingestr-0.6.5 → ingestr-0.7.0}/ingestr/testdata/create_replace.csv +0 -0
- {ingestr-0.6.5 → ingestr-0.7.0}/ingestr/testdata/delete_insert_expected.csv +0 -0
- {ingestr-0.6.5 → ingestr-0.7.0}/ingestr/testdata/delete_insert_part1.csv +0 -0
- {ingestr-0.6.5 → ingestr-0.7.0}/ingestr/testdata/delete_insert_part2.csv +0 -0
- {ingestr-0.6.5 → ingestr-0.7.0}/ingestr/testdata/merge_expected.csv +0 -0
- {ingestr-0.6.5 → ingestr-0.7.0}/ingestr/testdata/merge_part1.csv +0 -0
- {ingestr-0.6.5 → ingestr-0.7.0}/ingestr/testdata/merge_part2.csv +0 -0
- {ingestr-0.6.5 → ingestr-0.7.0}/package-lock.json +0 -0
- {ingestr-0.6.5 → ingestr-0.7.0}/package.json +0 -0
- {ingestr-0.6.5 → ingestr-0.7.0}/resources/demo.gif +0 -0
- {ingestr-0.6.5 → ingestr-0.7.0}/resources/demo.tape +0 -0
- {ingestr-0.6.5 → ingestr-0.7.0}/resources/ingestr.svg +0 -0

ingestr-0.7.0/.github/workflows/tests.yml
@@ -0,0 +1,36 @@
+on:
+  push:
+    branches:
+      - main
+  pull_request:
+    branches:
+      - main
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.ref }}
+  cancel-in-progress: true
+
+jobs:
+  tests:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+      - name: install Microsoft ODBC
+        run: sudo ACCEPT_EULA=Y apt-get install msodbcsql18 -y
+      - uses: actions/setup-python@v4
+        with:
+          python-version: '3.11'
+          cache: 'pip'
+      - name: Cache dependencies
+        uses: actions/cache@v3
+        id: cache
+        with:
+          path: ${{ env.pythonLocation }}
+          key: ${{ env.pythonLocation }}-${{ hashFiles('requirements.txt') }}
+      - name: Install pip dependencies
+        if: steps.cache.outputs.cache-hit != 'true'
+        run: make deps-ci
+      - name: run tests
+        run: make test-ci
+      - name: check the formatting
+        run: make lint-ci

{ingestr-0.6.5 → ingestr-0.7.0}/Makefile
@@ -11,6 +11,9 @@ venv/touchfile: requirements-dev.txt requirements.txt
 deps:
 	uv pip install -r requirements-dev.txt
 
+deps-ci:
+	pip install -r requirements-dev.txt
+
 test-ci:
 	pytest -rP -vv --tb=short --cov=ingestr --no-cov-on-fail
 
@@ -33,4 +36,4 @@ build:
 	rm -rf dist && python3 -m build
 
 upload-release:
-	twine upload dist/*
+	twine upload --verbose dist/*

{ingestr-0.6.5 → ingestr-0.7.0}/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.3
 Name: ingestr
-Version: 0.6.5
+Version: 0.7.0
 Summary: ingestr is a command-line application that ingests data from various sources and stores them in any database.
 Project-URL: Homepage, https://github.com/bruin-data/ingestr
 Project-URL: Issues, https://github.com/bruin-data/ingestr/issues
@@ -16,11 +16,12 @@ Classifier: Topic :: Database
 Requires-Python: >=3.9
 Requires-Dist: cx-oracle==8.3.0
 Requires-Dist: databricks-sql-connector==2.9.3
-Requires-Dist: dlt==0.
+Requires-Dist: dlt==0.5.1
 Requires-Dist: duckdb-engine==0.11.5
 Requires-Dist: duckdb==0.10.2
 Requires-Dist: google-api-python-client==2.130.0
 Requires-Dist: google-cloud-bigquery-storage==2.24.0
+Requires-Dist: mysql-connector-python==9.0.0
 Requires-Dist: pendulum==3.0.0
 Requires-Dist: psycopg2-binary==2.9.9
 Requires-Dist: py-machineid==0.5.1
@@ -171,6 +172,11 @@ Join our Slack community [here](https://join.slack.com/t/bruindatacommunity/shar
 <tr>
 <td colspan="3" style='text-align:center;'><strong>Platforms</strong></td>
 </tr>
+<tr>
+<td>Gorgias</td>
+<td>✅</td>
+<td>❌</td>
+</tr>
 <tr>
 <td>Google Sheets</td>
 <td>✅</td>

{ingestr-0.6.5 → ingestr-0.7.0}/README.md
@@ -128,6 +128,11 @@ Join our Slack community [here](https://join.slack.com/t/bruindatacommunity/shar
 <tr>
 <td colspan="3" style='text-align:center;'><strong>Platforms</strong></td>
 </tr>
+<tr>
+<td>Gorgias</td>
+<td>✅</td>
+<td>❌</td>
+</tr>
 <tr>
 <td>Google Sheets</td>
 <td>✅</td>

{ingestr-0.6.5 → ingestr-0.7.0}/docs/supported-sources/overview.md
@@ -79,6 +79,11 @@ ingestr supports the following sources and destinations:
 <tr>
 <td colspan="3" style='text-align:center;'><strong>Platforms</strong></td>
 </tr>
+<tr>
+<td>Gorgias</td>
+<td>✅</td>
+<td>❌</td>
+</tr>
 <tr>
 <td>Google Sheets</td>
 <td>✅</td>

{ingestr-0.6.5 → ingestr-0.7.0}/ingestr/main.py
@@ -90,6 +90,7 @@ class IncrementalStrategy(str, Enum):
     append = "append"
     delete_insert = "delete+insert"
     merge = "merge"
+    scd2 = "scd2"
     none = "none"


@@ -208,6 +209,20 @@ def ingest(
             envvar="LOADER_FILE_FORMAT",
         ),
     ] = None, # type: ignore
+    page_size: Annotated[
+        Optional[int],
+        typer.Option(
+            help="The page size to be used when fetching data from SQL sources",
+            envvar="PAGE_SIZE",
+        ),
+    ] = 50000, # type: ignore
+    loader_file_size: Annotated[
+        Optional[int],
+        typer.Option(
+            help="The file size to be used by the loader to split the data into multiple files. This can be set independent of the page size, since page size is used for fetching the data from the sources whereas this is used for the processing/loading part.",
+            envvar="LOADER_FILE_SIZE",
+        ),
+    ] = 100000, # type: ignore
 ):
     track(
         "command_triggered",
@@ -216,6 +231,10 @@ def ingest(
         },
     )

+    dlt.config["normalize.parquet_normalizer.add_dlt_load_id"] = True
+    dlt.config["normalize.parquet_normalizer.add_dlt_id"] = True
+    dlt.config["data_writer.file_max_items"] = loader_file_size
+
     try:
         if not dest_table:
             if len(source_table.split(".")) != 2:
@@ -255,7 +274,7 @@
             ),
             progress=progressInstance,
             pipelines_dir="pipeline_data",
-
+            refresh="drop_resources" if full_refresh else None,
         )

         if source.handles_incrementality():
@@ -282,6 +301,9 @@ def ingest(
         print(
             f"[bold yellow] Incremental Key:[/bold yellow] {incremental_key if incremental_key else 'None'}"
         )
+        print(
+            f"[bold yellow] Primary Key:[/bold yellow] {primary_key if primary_key else 'None'}"
+        )
         print()

         if not yes:
@@ -304,6 +326,7 @@ def ingest(
             interval_start=interval_start,
             interval_end=interval_end,
             sql_backend=sql_backend.value,
+            page_size=page_size,
         )

         if original_incremental_strategy == IncrementalStrategy.delete_insert:
@@ -322,15 +345,17 @@ def ingest(
         ):
             loader_file_format = None

+        write_disposition = None
+        if incremental_strategy != IncrementalStrategy.none:
+            write_disposition = incremental_strategy.value
+
         run_info: LoadInfo = pipeline.run(
             dlt_source,
             **destination.dlt_run_params(
                 uri=dest_uri,
                 table=dest_table,
             ),
-            write_disposition=
-            if incremental_strategy.value != IncrementalStrategy.none
-            else None, # type: ignore
+            write_disposition=write_disposition, # type: ignore
             primary_key=(primary_key if primary_key and len(primary_key) > 0 else None), # type: ignore
             loader_file_format=loader_file_format.value
             if loader_file_format is not None

{ingestr-0.6.5 → ingestr-0.7.0}/ingestr/src/factory.py
@@ -30,8 +30,10 @@ SQL_SOURCE_SCHEMES = [
     "mssql",
     "mysql",
     "mysql+pymysql",
+    "mysql+mysqlconnector",
     "postgres",
     "postgresql",
+    "postgresql+psycopg2",
     "redshift",
     "redshift+psycopg2",
     "snowflake",
@@ -111,6 +113,7 @@ class SourceDestinationFactory:
         "mssql": MsSQLDestination(),
         "postgres": PostgresDestination(),
         "postgresql": PostgresDestination(),
+        "postgresql+psycopg2": PostgresDestination(),
         "redshift": RedshiftDestination(),
         "redshift+psycopg2": RedshiftDestination(),
         "redshift+redshift_connector": RedshiftDestination(),

{ingestr-0.6.5 → ingestr-0.7.0}/ingestr/src/sources.py
@@ -12,6 +12,7 @@ from ingestr.src.mongodb import mongodb_collection
 from ingestr.src.notion import notion_databases
 from ingestr.src.shopify import shopify_source
 from ingestr.src.sql_database import sql_table
+from ingestr.src.table_definition import table_string_to_dataclass


 class SqlSource:
@@ -24,9 +25,7 @@ class SqlSource:
         return False

     def dlt_source(self, uri: str, table: str, **kwargs):
-        table_fields = table
-        if len(table_fields) != 2:
-            raise ValueError("Table name must be in the format schema.table")
+        table_fields = table_string_to_dataclass(table)

         incremental = None
         if kwargs.get("incremental_key"):
@@ -45,11 +44,12 @@ class SqlSource:

         table_instance = self.table_builder(
             credentials=uri,
-            schema=table_fields
-            table=table_fields
+            schema=table_fields.dataset,
+            table=table_fields.table,
             incremental=incremental,
             merge_key=kwargs.get("merge_key"),
             backend=kwargs.get("sql_backend", "sqlalchemy"),
+            chunk_size=kwargs.get("page_size", None),
         )

         return table_instance
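
The `table_string_to_dataclass` helper comes from the new `ingestr/src/table_definition.py` (15 added lines, not expanded in this diff). Based purely on how it is used above (a "schema.table" string turned into an object with `.dataset` and `.table` attributes), a hypothetical sketch could look like the following; the actual module and its names may differ:

# Hypothetical reconstruction of ingestr/src/table_definition.py, inferred only
# from its call sites in sources.py; not the actual shipped code.
from dataclasses import dataclass


@dataclass
class TableDefinition:
    dataset: str  # schema / database / range part
    table: str    # table / collection / spreadsheet part


def table_string_to_dataclass(table: str) -> TableDefinition:
    parts = table.split(".")
    if len(parts) != 2:
        raise ValueError("Table name must be in the format schema.table")
    return TableDefinition(dataset=parts[0], table=parts[1])
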
@@ -65,9 +65,7 @@ class MongoDbSource:
         return False

     def dlt_source(self, uri: str, table: str, **kwargs):
-        table_fields = table
-        if len(table_fields) != 2:
-            raise ValueError("Table name must be in the format schema.table")
+        table_fields = table_string_to_dataclass(table)

         incremental = None
         if kwargs.get("incremental_key"):
@@ -82,8 +80,8 @@

         table_instance = self.table_builder(
             connection_url=uri,
-            database=table_fields
-            collection=table_fields
+            database=table_fields.dataset,
+            collection=table_fields.table,
             parallel=True,
             incremental=incremental,
         )
@@ -125,7 +123,6 @@ class LocalCsvSource:
                     f"incremental_key '{incremental_key}' not found in the CSV file"
                 )

-            print("BURAYA GELLDIII")
             if inc_value < incremental.start_value:
                 continue

@@ -186,8 +183,6 @@ class ShopifySource:
                 "Shopify takes care of incrementality on its own, you should not provide incremental_key"
             )

-        # shopify://shop_url?api_key=private_app_password
-
         source_fields = urlparse(uri)
         source_params = parse_qs(source_fields.query)
         api_key = source_params.get("api_key")
@@ -293,15 +288,10 @@ class GoogleSheetsSource:
             base64.b64decode(credentials_base64[0]).decode("utf-8")
         )

-        table_fields = table
-        if len(table_fields) != 2:
-            raise ValueError(
-                "Table name must be in the format <spreadsheet_id>.<sheet_name>"
-            )
-
+        table_fields = table_string_to_dataclass(table)
         return self.table_builder(
             credentials=credentials,
-            spreadsheet_url_or_id=table_fields
-            range_names=[table_fields
+            spreadsheet_url_or_id=table_fields.table,
+            range_names=[table_fields.dataset],
             get_named_ranges=False,
         )

{ingestr-0.6.5 → ingestr-0.7.0}/ingestr/src/sql_database/__init__.py
@@ -3,7 +3,6 @@
 from typing import Any, Callable, Dict, Iterable, List, Optional, Union

 import dlt
-from dlt.common.configuration.specs.config_section_context import ConfigSectionContext
 from dlt.sources import DltResource
 from sqlalchemy import MetaData, Table
 from sqlalchemy.engine import Engine
@@ -12,35 +11,41 @@ from .helpers import (
     SqlDatabaseTableConfiguration,
     SqlTableResourceConfiguration,
     TableBackend,
+    _detect_precision_hints_deprecated,
     engine_from_credentials,
-    get_primary_key,
     table_rows,
 )
-from .override import IngestrConnectionStringCredentials
-from .schema_types import
+from .override import IngestrConnectionStringCredentials as ConnectionStringCredentials
+from .schema_types import (
+    ReflectionLevel,
+    TTypeAdapter,
+    get_primary_key,
+    table_to_columns,
+)


 @dlt.source
 def sql_database(
-    credentials: Union[
-        IngestrConnectionStringCredentials, Engine, str
-    ] = dlt.secrets.value,
+    credentials: Union[ConnectionStringCredentials, Engine, str] = dlt.secrets.value,
     schema: Optional[str] = dlt.config.value,
     metadata: Optional[MetaData] = None,
     table_names: Optional[List[str]] = dlt.config.value,
     chunk_size: int = 50000,
     backend: TableBackend = "sqlalchemy",
-    detect_precision_hints: Optional[bool] =
-
+    detect_precision_hints: Optional[bool] = False,
+    reflection_level: Optional[ReflectionLevel] = "full",
+    defer_table_reflect: Optional[bool] = None,
     table_adapter_callback: Callable[[Table], None] = None,
     backend_kwargs: Dict[str, Any] = None,
+    include_views: bool = False,
+    type_adapter_callback: Optional[TTypeAdapter] = None,
 ) -> Iterable[DltResource]:
     """
     A dlt source which loads data from an SQL database using SQLAlchemy.
     Resources are automatically created for each table in the schema or from the given list of tables.

     Args:
-        credentials (Union[
+        credentials (Union[ConnectionStringCredentials, Engine, str]): Database credentials or an `sqlalchemy.Engine` instance.
         schema (Optional[str]): Name of the database schema to load (if different from default).
         metadata (Optional[MetaData]): Optional `sqlalchemy.MetaData` instance. `schema` argument is ignored when this is used.
         table_names (Optional[List[str]]): A list of table names to load. By default, all tables in the schema are loaded.
@@ -49,15 +54,30 @@ def sql_database(
         "sqlalchemy" yields batches as lists of Python dictionaries, "pyarrow" and "connectorx" yield batches as arrow tables, "pandas" yields panda frames.
         "sqlalchemy" is the default and does not require additional dependencies, "pyarrow" creates stable destination schemas with correct data types,
         "connectorx" is typically the fastest but ignores the "chunk_size" so you must deal with large tables yourself.
-        detect_precision_hints (bool): Set column precision and scale hints for supported data types in the target schema based on the columns in the source tables.
+        detect_precision_hints (bool): Deprecated. Use `reflection_level`. Set column precision and scale hints for supported data types in the target schema based on the columns in the source tables.
         This is disabled by default.
+        reflection_level: (ReflectionLevel): Specifies how much information should be reflected from the source database schema.
+        "minimal": Only table names, nullability and primary keys are reflected. Data types are inferred from the data.
+        "full": Data types will be reflected on top of "minimal". `dlt` will coerce the data into reflected types if necessary. This is the default option.
+        "full_with_precision": Sets precision and scale on supported data types (ie. decimal, text, binary). Creates big and regular integer types.
         defer_table_reflect (bool): Will connect and reflect table schema only when yielding data. Requires table_names to be explicitly passed.
         Enable this option when running on Airflow. Available on dlt 0.4.4 and later.
         table_adapter_callback: (Callable): Receives each reflected table. May be used to modify the list of columns that will be selected.
         backend_kwargs (**kwargs): kwargs passed to table backend ie. "conn" is used to pass specialized connection string to connectorx.
+        include_views (bool): Reflect views as well as tables. Note view names included in `table_names` are always included regardless of this setting.
+        type_adapter_callback(Optional[Callable]): Callable to override type inference when reflecting columns.
+        Argument is a single sqlalchemy data type (`TypeEngine` instance) and it should return another sqlalchemy data type, or `None` (type will be inferred from data)
     Returns:
+
         Iterable[DltResource]: A list of DLT resources for each table to be loaded.
     """
+    # detect precision hints is deprecated
+    _detect_precision_hints_deprecated(detect_precision_hints)
+
+    if detect_precision_hints:
+        reflection_level = "full_with_precision"
+    else:
+        reflection_level = reflection_level or "minimal"

     # set up alchemy engine
     engine = engine_from_credentials(credentials)
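
For orientation, a hedged usage sketch of the updated source signature above, wired into a dlt pipeline; the connection URI, dataset names and destination are illustrative, not taken from the diff:

# Illustrative only: exercises the new reflection_level / include_views knobs
# of the vendored sql_database source shown above.
import dlt

from ingestr.src.sql_database import sql_database

source = sql_database(
    credentials="postgresql+psycopg2://user:password@localhost:5432/mydb",  # example URI
    schema="public",
    backend="pyarrow",        # arrow batches, handled by the new arrow_helpers module
    reflection_level="full",  # "minimal" | "full" | "full_with_precision"
    include_views=True,       # also reflect views (new parameter)
)

pipeline = dlt.pipeline(
    pipeline_name="example_sql_copy", destination="duckdb", dataset_name="raw"
)
pipeline.run(source)
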
@@ -73,49 +93,49 @@
     else:
         if defer_table_reflect:
             raise ValueError("You must pass table names to defer table reflection")
-        metadata.reflect(bind=engine)
+        metadata.reflect(bind=engine, views=include_views)
         tables = list(metadata.tables.values())

     for table in tables:
         if table_adapter_callback and not defer_table_reflect:
             table_adapter_callback(table)
+
         yield dlt.resource(
             table_rows,
             name=table.name,
             primary_key=get_primary_key(table),
             spec=SqlDatabaseTableConfiguration,
-            columns=table_to_columns(table,
+            columns=table_to_columns(table, reflection_level, type_adapter_callback),
         )(
             engine,
             table,
             chunk_size,
             backend,
-
+            reflection_level=reflection_level,
             defer_table_reflect=defer_table_reflect,
             table_adapter_callback=table_adapter_callback,
             backend_kwargs=backend_kwargs,
+            type_adapter_callback=type_adapter_callback,
         )


-@dlt.
-
-    spec=SqlTableResourceConfiguration,
-    sections_merge_style=ConfigSectionContext.resource_merge_style,
+@dlt.resource(
+    name=lambda args: args["table"], standalone=True, spec=SqlTableResourceConfiguration
 )
 def sql_table(
-    credentials: Union[
-        IngestrConnectionStringCredentials, Engine, str
-    ] = dlt.secrets.value,
+    credentials: Union[ConnectionStringCredentials, Engine, str] = str,
     table: str = dlt.config.value,
     schema: Optional[str] = dlt.config.value,
     metadata: Optional[MetaData] = None,
     incremental: Optional[dlt.sources.incremental[Any]] = None,
-    chunk_size: int =
+    chunk_size: int = 50000,
     backend: TableBackend = "sqlalchemy",
-    detect_precision_hints: Optional[bool] =
-
+    detect_precision_hints: Optional[bool] = None,
+    reflection_level: Optional[ReflectionLevel] = "full_with_precision",
+    defer_table_reflect: Optional[bool] = None,
     table_adapter_callback: Callable[[Table], None] = None,
     backend_kwargs: Dict[str, Any] = None,
+    type_adapter_callback: Optional[TTypeAdapter] = None,
     merge_key: Optional[str] = None,
 ) -> DltResource:
     """
@@ -123,7 +143,7 @@ def sql_table(

     Args:
         credentials (Union[ConnectionStringCredentials, Engine, str]): Database credentials or an `Engine` instance representing the database connection.
-        table (str): Name of the table to load.
+        table (str): Name of the table or view to load.
         schema (Optional[str]): Optional name of the schema the table belongs to.
         metadata (Optional[MetaData]): Optional `sqlalchemy.MetaData` instance. If provided, the `schema` argument is ignored.
         incremental (Optional[dlt.sources.incremental[Any]]): Option to enable incremental loading for the table.
@@ -133,16 +153,29 @@
         "sqlalchemy" yields batches as lists of Python dictionaries, "pyarrow" and "connectorx" yield batches as arrow tables, "pandas" yields panda frames.
         "sqlalchemy" is the default and does not require additional dependencies, "pyarrow" creates stable destination schemas with correct data types,
         "connectorx" is typically the fastest but ignores the "chunk_size" so you must deal with large tables yourself.
-
+        reflection_level: (ReflectionLevel): Specifies how much information should be reflected from the source database schema.
+        "minimal": Only table names, nullability and primary keys are reflected. Data types are inferred from the data.
+        "full": Data types will be reflected on top of "minimal". `dlt` will coerce the data into reflected types if necessary. This is the default option.
+        "full_with_precision": Sets precision and scale on supported data types (ie. decimal, text, binary). Creates big and regular integer types.
+        detect_precision_hints (bool): Deprecated. Use `reflection_level`. Set column precision and scale hints for supported data types in the target schema based on the columns in the source tables.
         This is disabled by default.
         defer_table_reflect (bool): Will connect and reflect table schema only when yielding data. Enable this option when running on Airflow. Available
         on dlt 0.4.4 and later
         table_adapter_callback: (Callable): Receives each reflected table. May be used to modify the list of columns that will be selected.
         backend_kwargs (**kwargs): kwargs passed to table backend ie. "conn" is used to pass specialized connection string to connectorx.
+        type_adapter_callback(Optional[Callable]): Callable to override type inference when reflecting columns.
+        Argument is a single sqlalchemy data type (`TypeEngine` instance) and it should return another sqlalchemy data type, or `None` (type will be inferred from data)

     Returns:
         DltResource: The dlt resource for loading data from the SQL database table.
     """
+    _detect_precision_hints_deprecated(detect_precision_hints)
+
+    if detect_precision_hints:
+        reflection_level = "full_with_precision"
+    else:
+        reflection_level = reflection_level or "minimal"
+
     engine = engine_from_credentials(credentials)
     engine.execution_options(stream_results=True, max_row_buffer=2 * chunk_size)
     metadata = metadata or MetaData(schema=schema)
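
As a hedged illustration of the new `type_adapter_callback` contract documented above (the callback, URI and table name below are invented for the example):

# Illustrative: override reflected column types before they reach dlt.
# Returning the type unchanged keeps it; returning None lets dlt infer
# the type from the data, as described in the docstring above.
from typing import Optional

import sqlalchemy as sa

from ingestr.src.sql_database import sql_table


def adapt_types(sql_type: sa.types.TypeEngine) -> Optional[sa.types.TypeEngine]:
    # Example policy: load JSON columns as plain text.
    if isinstance(sql_type, sa.JSON):
        return sa.Text()
    return sql_type


orders = sql_table(
    credentials="postgresql+psycopg2://user:password@localhost:5432/mydb",  # example URI
    table="orders",
    schema="public",
    reflection_level="full_with_precision",
    type_adapter_callback=adapt_types,
)
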
@@ -157,7 +190,7 @@
         table_rows,
         name=table_obj.name,
         primary_key=get_primary_key(table_obj),
-        columns=table_to_columns(table_obj,
+        columns=table_to_columns(table_obj, reflection_level, type_adapter_callback),
         merge_key=merge_key,
     )(
         engine,
@@ -165,8 +198,9 @@
         chunk_size,
         backend,
         incremental=incremental,
-
+        reflection_level=reflection_level,
         defer_table_reflect=defer_table_reflect,
         table_adapter_callback=table_adapter_callback,
         backend_kwargs=backend_kwargs,
+        type_adapter_callback=type_adapter_callback,
     )

ingestr-0.7.0/ingestr/src/sql_database/arrow_helpers.py
@@ -0,0 +1,139 @@
+from typing import Any, Optional, Sequence
+
+from dlt.common import logger
+from dlt.common.configuration import with_config
+from dlt.common.destination import DestinationCapabilitiesContext
+from dlt.common.json import custom_encode, map_nested_in_place
+from dlt.common.schema.typing import TTableSchemaColumns
+
+from .schema_types import RowAny
+
+
+@with_config
+def columns_to_arrow(
+    columns_schema: TTableSchemaColumns,
+    caps: DestinationCapabilitiesContext = None,
+    tz: str = "UTC",
+) -> Any:
+    """Converts `column_schema` to arrow schema using `caps` and `tz`. `caps` are injected from the container - which
+    is always the case if run within the pipeline. This will generate arrow schema compatible with the destination.
+    Otherwise generic capabilities are used
+    """
+    from dlt.common.destination.capabilities import DestinationCapabilitiesContext
+    from dlt.common.libs.pyarrow import get_py_arrow_datatype
+    from dlt.common.libs.pyarrow import pyarrow as pa
+
+    return pa.schema(
+        [
+            pa.field(
+                name,
+                get_py_arrow_datatype(
+                    schema_item,
+                    caps or DestinationCapabilitiesContext.generic_capabilities(),
+                    tz,
+                ),
+                nullable=schema_item.get("nullable", True),
+            )
+            for name, schema_item in columns_schema.items()
+            if schema_item.get("data_type") is not None
+        ]
+    )
+
+
+def row_tuples_to_arrow(
+    rows: Sequence[RowAny], columns: TTableSchemaColumns, tz: str
+) -> Any:
+    """Converts the rows to an arrow table using the columns schema.
+    Columns missing `data_type` will be inferred from the row data.
+    Columns with object types not supported by arrow are excluded from the resulting table.
+    """
+    import numpy as np
+    from dlt.common.libs.pyarrow import pyarrow as pa
+
+    try:
+        from pandas._libs import lib
+
+        pivoted_rows = lib.to_object_array_tuples(rows).T # type: ignore[attr-defined]
+    except ImportError:
+        logger.info(
+            "Pandas not installed, reverting to numpy.asarray to create a table which is slower"
+        )
+        pivoted_rows = np.asarray(rows, dtype="object", order="k").T # type: ignore[call-overload]
+
+    columnar = {
+        col: dat.ravel()
+        for col, dat in zip(columns, np.vsplit(pivoted_rows, len(columns)))
+    }
+    columnar_known_types = {
+        col["name"]: columnar[col["name"]]
+        for col in columns.values()
+        if col.get("data_type") is not None
+    }
+    columnar_unknown_types = {
+        col["name"]: columnar[col["name"]]
+        for col in columns.values()
+        if col.get("data_type") is None
+    }
+
+    arrow_schema = columns_to_arrow(columns, tz=tz)
+
+    for idx in range(0, len(arrow_schema.names)):
+        field = arrow_schema.field(idx)
+        py_type = type(rows[0][idx])
+        # cast double / float ndarrays to decimals if type mismatch, looks like decimals and floats are often mixed up in dialects
+        if pa.types.is_decimal(field.type) and issubclass(py_type, (str, float)):
+            logger.warning(
+                f"Field {field.name} was reflected as decimal type, but rows contains {py_type.__name__}. Additional cast is required which may slow down arrow table generation."
+            )
+            float_array = pa.array(columnar_known_types[field.name], type=pa.float64())
+            columnar_known_types[field.name] = float_array.cast(field.type, safe=False)
+
+    # If there are unknown type columns, first create a table to infer their types
+    if columnar_unknown_types:
+        new_schema_fields = []
+        for key in list(columnar_unknown_types):
+            arrow_col: Optional[pa.Array] = None
+            try:
+                arrow_col = pa.array(columnar_unknown_types[key])
+                if pa.types.is_null(arrow_col.type):
+                    logger.warning(
+                        f"Column {key} contains only NULL values and data type could not be inferred. This column is removed from a arrow table"
+                    )
+                    continue
+
+            except pa.ArrowInvalid as e:
+                # Try coercing types not supported by arrow to a json friendly format
+                # E.g. dataclasses -> dict, UUID -> str
+                try:
+                    arrow_col = pa.array(
+                        map_nested_in_place(
+                            custom_encode, list(columnar_unknown_types[key])
+                        )
+                    )
+                    logger.warning(
+                        f"Column {key} contains a data type which is not supported by pyarrow and got converted into {arrow_col.type}. This slows down arrow table generation."
+                    )
+                except (pa.ArrowInvalid, TypeError):
+                    logger.warning(
+                        f"Column {key} contains a data type which is not supported by pyarrow. This column will be ignored. Error: {e}"
+                    )
+            if arrow_col is not None:
+                columnar_known_types[key] = arrow_col
+                new_schema_fields.append(
+                    pa.field(
+                        key,
+                        arrow_col.type,
+                        nullable=columns[key]["nullable"],
+                    )
+                )
+
+    # New schema
+    column_order = {name: idx for idx, name in enumerate(columns)}
+    arrow_schema = pa.schema(
+        sorted(
+            list(arrow_schema) + new_schema_fields,
+            key=lambda x: column_order[x.name],
+        )
+    )
+
+    return pa.Table.from_pydict(columnar_known_types, schema=arrow_schema)