ingestr 0.6.6__tar.gz → 0.7.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (94)
  1. ingestr-0.7.0/.github/workflows/tests.yml +36 -0
  2. {ingestr-0.6.6 → ingestr-0.7.0}/Makefile +4 -1
  3. {ingestr-0.6.6 → ingestr-0.7.0}/PKG-INFO +2 -2
  4. {ingestr-0.6.6 → ingestr-0.7.0}/ingestr/main.py +29 -4
  5. {ingestr-0.6.6 → ingestr-0.7.0}/ingestr/src/factory.py +2 -0
  6. {ingestr-0.6.6 → ingestr-0.7.0}/ingestr/src/sources.py +1 -3
  7. {ingestr-0.6.6 → ingestr-0.7.0}/ingestr/src/sql_database/__init__.py +62 -28
  8. ingestr-0.7.0/ingestr/src/sql_database/arrow_helpers.py +139 -0
  9. {ingestr-0.6.6 → ingestr-0.7.0}/ingestr/src/sql_database/helpers.py +57 -33
  10. ingestr-0.7.0/ingestr/src/sql_database/schema_types.py +139 -0
  11. ingestr-0.7.0/ingestr/src/version.py +1 -0
  12. {ingestr-0.6.6 → ingestr-0.7.0}/pyproject.toml +6 -0
  13. ingestr-0.7.0/requirements-dev.txt +10 -0
  14. {ingestr-0.6.6 → ingestr-0.7.0}/requirements.txt +2 -2
  15. ingestr-0.6.6/ingestr/main_test.py +0 -875
  16. ingestr-0.6.6/ingestr/src/destinations_test.py +0 -113
  17. ingestr-0.6.6/ingestr/src/factory_test.py +0 -13
  18. ingestr-0.6.6/ingestr/src/gorgias/helpers_test.py +0 -45
  19. ingestr-0.6.6/ingestr/src/sources_test.py +0 -96
  20. ingestr-0.6.6/ingestr/src/sql_database/schema_types.py +0 -162
  21. ingestr-0.6.6/ingestr/src/version.py +0 -1
  22. ingestr-0.6.6/requirements-dev.txt +0 -9
  23. {ingestr-0.6.6 → ingestr-0.7.0}/.dockerignore +0 -0
  24. {ingestr-0.6.6 → ingestr-0.7.0}/.github/workflows/deploy-docs.yml +0 -0
  25. {ingestr-0.6.6 → ingestr-0.7.0}/.github/workflows/docker.yml +0 -0
  26. {ingestr-0.6.6 → ingestr-0.7.0}/.gitignore +0 -0
  27. {ingestr-0.6.6 → ingestr-0.7.0}/.python-version +0 -0
  28. {ingestr-0.6.6 → ingestr-0.7.0}/Dockerfile +0 -0
  29. {ingestr-0.6.6 → ingestr-0.7.0}/LICENSE.md +0 -0
  30. {ingestr-0.6.6 → ingestr-0.7.0}/README.md +0 -0
  31. {ingestr-0.6.6 → ingestr-0.7.0}/docs/.vitepress/config.mjs +0 -0
  32. {ingestr-0.6.6 → ingestr-0.7.0}/docs/.vitepress/theme/custom.css +0 -0
  33. {ingestr-0.6.6 → ingestr-0.7.0}/docs/.vitepress/theme/index.js +0 -0
  34. {ingestr-0.6.6 → ingestr-0.7.0}/docs/commands/example-uris.md +0 -0
  35. {ingestr-0.6.6 → ingestr-0.7.0}/docs/commands/ingest.md +0 -0
  36. {ingestr-0.6.6 → ingestr-0.7.0}/docs/getting-started/core-concepts.md +0 -0
  37. {ingestr-0.6.6 → ingestr-0.7.0}/docs/getting-started/incremental-loading.md +0 -0
  38. {ingestr-0.6.6 → ingestr-0.7.0}/docs/getting-started/quickstart.md +0 -0
  39. {ingestr-0.6.6 → ingestr-0.7.0}/docs/getting-started/telemetry.md +0 -0
  40. {ingestr-0.6.6 → ingestr-0.7.0}/docs/index.md +0 -0
  41. {ingestr-0.6.6 → ingestr-0.7.0}/docs/supported-sources/bigquery.md +0 -0
  42. {ingestr-0.6.6 → ingestr-0.7.0}/docs/supported-sources/csv.md +0 -0
  43. {ingestr-0.6.6 → ingestr-0.7.0}/docs/supported-sources/databricks.md +0 -0
  44. {ingestr-0.6.6 → ingestr-0.7.0}/docs/supported-sources/duckdb.md +0 -0
  45. {ingestr-0.6.6 → ingestr-0.7.0}/docs/supported-sources/gorgias.md +0 -0
  46. {ingestr-0.6.6 → ingestr-0.7.0}/docs/supported-sources/gsheets.md +0 -0
  47. {ingestr-0.6.6 → ingestr-0.7.0}/docs/supported-sources/mongodb.md +0 -0
  48. {ingestr-0.6.6 → ingestr-0.7.0}/docs/supported-sources/mssql.md +0 -0
  49. {ingestr-0.6.6 → ingestr-0.7.0}/docs/supported-sources/mysql.md +0 -0
  50. {ingestr-0.6.6 → ingestr-0.7.0}/docs/supported-sources/notion.md +0 -0
  51. {ingestr-0.6.6 → ingestr-0.7.0}/docs/supported-sources/oracle.md +0 -0
  52. {ingestr-0.6.6 → ingestr-0.7.0}/docs/supported-sources/overview.md +0 -0
  53. {ingestr-0.6.6 → ingestr-0.7.0}/docs/supported-sources/postgres.md +0 -0
  54. {ingestr-0.6.6 → ingestr-0.7.0}/docs/supported-sources/redshift.md +0 -0
  55. {ingestr-0.6.6 → ingestr-0.7.0}/docs/supported-sources/sap-hana.md +0 -0
  56. {ingestr-0.6.6 → ingestr-0.7.0}/docs/supported-sources/shopify.md +0 -0
  57. {ingestr-0.6.6 → ingestr-0.7.0}/docs/supported-sources/snowflake.md +0 -0
  58. {ingestr-0.6.6 → ingestr-0.7.0}/docs/supported-sources/sqlite.md +0 -0
  59. {ingestr-0.6.6 → ingestr-0.7.0}/ingestr/src/destinations.py +0 -0
  60. {ingestr-0.6.6 → ingestr-0.7.0}/ingestr/src/google_sheets/README.md +0 -0
  61. {ingestr-0.6.6 → ingestr-0.7.0}/ingestr/src/google_sheets/__init__.py +0 -0
  62. {ingestr-0.6.6 → ingestr-0.7.0}/ingestr/src/google_sheets/helpers/__init__.py +0 -0
  63. {ingestr-0.6.6 → ingestr-0.7.0}/ingestr/src/google_sheets/helpers/api_calls.py +0 -0
  64. {ingestr-0.6.6 → ingestr-0.7.0}/ingestr/src/google_sheets/helpers/data_processing.py +0 -0
  65. {ingestr-0.6.6 → ingestr-0.7.0}/ingestr/src/gorgias/__init__.py +0 -0
  66. {ingestr-0.6.6 → ingestr-0.7.0}/ingestr/src/gorgias/helpers.py +0 -0
  67. {ingestr-0.6.6 → ingestr-0.7.0}/ingestr/src/mongodb/__init__.py +0 -0
  68. {ingestr-0.6.6 → ingestr-0.7.0}/ingestr/src/mongodb/helpers.py +0 -0
  69. {ingestr-0.6.6 → ingestr-0.7.0}/ingestr/src/notion/__init__.py +0 -0
  70. {ingestr-0.6.6 → ingestr-0.7.0}/ingestr/src/notion/helpers/__init__.py +0 -0
  71. {ingestr-0.6.6 → ingestr-0.7.0}/ingestr/src/notion/helpers/client.py +0 -0
  72. {ingestr-0.6.6 → ingestr-0.7.0}/ingestr/src/notion/helpers/database.py +0 -0
  73. {ingestr-0.6.6 → ingestr-0.7.0}/ingestr/src/notion/settings.py +0 -0
  74. {ingestr-0.6.6 → ingestr-0.7.0}/ingestr/src/shopify/__init__.py +0 -0
  75. {ingestr-0.6.6 → ingestr-0.7.0}/ingestr/src/shopify/exceptions.py +0 -0
  76. {ingestr-0.6.6 → ingestr-0.7.0}/ingestr/src/shopify/helpers.py +0 -0
  77. {ingestr-0.6.6 → ingestr-0.7.0}/ingestr/src/shopify/settings.py +0 -0
  78. {ingestr-0.6.6 → ingestr-0.7.0}/ingestr/src/sql_database/override.py +0 -0
  79. {ingestr-0.6.6 → ingestr-0.7.0}/ingestr/src/table_definition.py +0 -0
  80. {ingestr-0.6.6 → ingestr-0.7.0}/ingestr/src/telemetry/event.py +0 -0
  81. {ingestr-0.6.6 → ingestr-0.7.0}/ingestr/src/testdata/fakebqcredentials.json +0 -0
  82. {ingestr-0.6.6 → ingestr-0.7.0}/ingestr/testdata/.gitignore +0 -0
  83. {ingestr-0.6.6 → ingestr-0.7.0}/ingestr/testdata/create_replace.csv +0 -0
  84. {ingestr-0.6.6 → ingestr-0.7.0}/ingestr/testdata/delete_insert_expected.csv +0 -0
  85. {ingestr-0.6.6 → ingestr-0.7.0}/ingestr/testdata/delete_insert_part1.csv +0 -0
  86. {ingestr-0.6.6 → ingestr-0.7.0}/ingestr/testdata/delete_insert_part2.csv +0 -0
  87. {ingestr-0.6.6 → ingestr-0.7.0}/ingestr/testdata/merge_expected.csv +0 -0
  88. {ingestr-0.6.6 → ingestr-0.7.0}/ingestr/testdata/merge_part1.csv +0 -0
  89. {ingestr-0.6.6 → ingestr-0.7.0}/ingestr/testdata/merge_part2.csv +0 -0
  90. {ingestr-0.6.6 → ingestr-0.7.0}/package-lock.json +0 -0
  91. {ingestr-0.6.6 → ingestr-0.7.0}/package.json +0 -0
  92. {ingestr-0.6.6 → ingestr-0.7.0}/resources/demo.gif +0 -0
  93. {ingestr-0.6.6 → ingestr-0.7.0}/resources/demo.tape +0 -0
  94. {ingestr-0.6.6 → ingestr-0.7.0}/resources/ingestr.svg +0 -0

ingestr-0.7.0/.github/workflows/tests.yml
@@ -0,0 +1,36 @@
+on:
+  push:
+    branches:
+      - main
+  pull_request:
+    branches:
+      - main
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.ref }}
+  cancel-in-progress: true
+
+jobs:
+  tests:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+      - name: install Microsoft ODBC
+        run: sudo ACCEPT_EULA=Y apt-get install msodbcsql18 -y
+      - uses: actions/setup-python@v4
+        with:
+          python-version: '3.11'
+          cache: 'pip'
+      - name: Cache dependencies
+        uses: actions/cache@v3
+        id: cache
+        with:
+          path: ${{ env.pythonLocation }}
+          key: ${{ env.pythonLocation }}-${{ hashFiles('requirements.txt') }}
+      - name: Install pip dependencies
+        if: steps.cache.outputs.cache-hit != 'true'
+        run: make deps-ci
+      - name: run tests
+        run: make test-ci
+      - name: check the formatting
+        run: make lint-ci

{ingestr-0.6.6 → ingestr-0.7.0}/Makefile
@@ -11,6 +11,9 @@ venv/touchfile: requirements-dev.txt requirements.txt
 deps:
 	uv pip install -r requirements-dev.txt
 
+deps-ci:
+	pip install -r requirements-dev.txt
+
 test-ci:
 	pytest -rP -vv --tb=short --cov=ingestr --no-cov-on-fail
 
@@ -33,4 +36,4 @@ build:
 	rm -rf dist && python3 -m build
 
 upload-release:
-	twine upload dist/*
+	twine upload --verbose dist/*

{ingestr-0.6.6 → ingestr-0.7.0}/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.3
 Name: ingestr
-Version: 0.6.6
+Version: 0.7.0
 Summary: ingestr is a command-line application that ingests data from various sources and stores them in any database.
 Project-URL: Homepage, https://github.com/bruin-data/ingestr
 Project-URL: Issues, https://github.com/bruin-data/ingestr/issues
@@ -16,7 +16,7 @@ Classifier: Topic :: Database
 Requires-Python: >=3.9
 Requires-Dist: cx-oracle==8.3.0
 Requires-Dist: databricks-sql-connector==2.9.3
-Requires-Dist: dlt==0.4.12
+Requires-Dist: dlt==0.5.1
 Requires-Dist: duckdb-engine==0.11.5
 Requires-Dist: duckdb==0.10.2
 Requires-Dist: google-api-python-client==2.130.0

{ingestr-0.6.6 → ingestr-0.7.0}/ingestr/main.py
@@ -90,6 +90,7 @@ class IncrementalStrategy(str, Enum):
     append = "append"
     delete_insert = "delete+insert"
     merge = "merge"
+    scd2 = "scd2"
     none = "none"
 
 
@@ -208,6 +209,20 @@ def ingest(
             envvar="LOADER_FILE_FORMAT",
         ),
     ] = None,  # type: ignore
+    page_size: Annotated[
+        Optional[int],
+        typer.Option(
+            help="The page size to be used when fetching data from SQL sources",
+            envvar="PAGE_SIZE",
+        ),
+    ] = 50000,  # type: ignore
+    loader_file_size: Annotated[
+        Optional[int],
+        typer.Option(
+            help="The file size to be used by the loader to split the data into multiple files. This can be set independent of the page size, since page size is used for fetching the data from the sources whereas this is used for the processing/loading part.",
+            envvar="LOADER_FILE_SIZE",
+        ),
+    ] = 100000,  # type: ignore
 ):
     track(
         "command_triggered",
@@ -216,6 +231,10 @@ def ingest(
         },
     )
 
+    dlt.config["normalize.parquet_normalizer.add_dlt_load_id"] = True
+    dlt.config["normalize.parquet_normalizer.add_dlt_id"] = True
+    dlt.config["data_writer.file_max_items"] = loader_file_size
+
     try:
         if not dest_table:
             if len(source_table.split(".")) != 2:
@@ -255,7 +274,7 @@
         ),
         progress=progressInstance,
         pipelines_dir="pipeline_data",
-        full_refresh=full_refresh,
+        refresh="drop_resources" if full_refresh else None,
     )
 
     if source.handles_incrementality():
@@ -282,6 +301,9 @@
     print(
         f"[bold yellow] Incremental Key:[/bold yellow] {incremental_key if incremental_key else 'None'}"
     )
+    print(
+        f"[bold yellow] Primary Key:[/bold yellow] {primary_key if primary_key else 'None'}"
+    )
     print()
 
     if not yes:
@@ -304,6 +326,7 @@
         interval_start=interval_start,
         interval_end=interval_end,
         sql_backend=sql_backend.value,
+        page_size=page_size,
     )
 
     if original_incremental_strategy == IncrementalStrategy.delete_insert:
@@ -322,15 +345,17 @@
     ):
         loader_file_format = None
 
+    write_disposition = None
+    if incremental_strategy != IncrementalStrategy.none:
+        write_disposition = incremental_strategy.value
+
     run_info: LoadInfo = pipeline.run(
         dlt_source,
         **destination.dlt_run_params(
             uri=dest_uri,
             table=dest_table,
         ),
-        write_disposition=incremental_strategy.value
-        if incremental_strategy.value != IncrementalStrategy.none
-        else None,  # type: ignore
+        write_disposition=write_disposition,  # type: ignore
         primary_key=(primary_key if primary_key and len(primary_key) > 0 else None),  # type: ignore
         loader_file_format=loader_file_format.value
         if loader_file_format is not None
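
For orientation, the refactor above just precomputes dlt's write disposition from the CLI strategy: every strategy value is passed through as-is, and none becomes None so dlt falls back to its default behaviour. Below is a minimal sketch of that mapping, reusing the enum values from the first main.py hunk; the helper name is invented for illustration.

from enum import Enum
from typing import Optional


class IncrementalStrategy(str, Enum):
    append = "append"
    delete_insert = "delete+insert"
    merge = "merge"
    scd2 = "scd2"  # new in 0.7.0
    none = "none"


def to_write_disposition(strategy: IncrementalStrategy) -> Optional[str]:
    # Mirrors the new logic: "none" means "let dlt decide", anything else is passed through.
    return None if strategy == IncrementalStrategy.none else strategy.value


assert to_write_disposition(IncrementalStrategy.scd2) == "scd2"
assert to_write_disposition(IncrementalStrategy.none) is None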

{ingestr-0.6.6 → ingestr-0.7.0}/ingestr/src/factory.py
@@ -33,6 +33,7 @@ SQL_SOURCE_SCHEMES = [
     "mysql+mysqlconnector",
     "postgres",
     "postgresql",
+    "postgresql+psycopg2",
     "redshift",
     "redshift+psycopg2",
     "snowflake",
@@ -112,6 +113,7 @@ class SourceDestinationFactory:
         "mssql": MsSQLDestination(),
         "postgres": PostgresDestination(),
         "postgresql": PostgresDestination(),
+        "postgresql+psycopg2": PostgresDestination(),
         "redshift": RedshiftDestination(),
         "redshift+psycopg2": RedshiftDestination(),
         "redshift+redshift_connector": RedshiftDestination(),

{ingestr-0.6.6 → ingestr-0.7.0}/ingestr/src/sources.py
@@ -49,6 +49,7 @@ class SqlSource:
             incremental=incremental,
             merge_key=kwargs.get("merge_key"),
             backend=kwargs.get("sql_backend", "sqlalchemy"),
+            chunk_size=kwargs.get("page_size", None),
         )
 
         return table_instance
@@ -122,7 +123,6 @@ class LocalCsvSource:
                         f"incremental_key '{incremental_key}' not found in the CSV file"
                     )
 
-                print("BURAYA GELLDIII")
                 if inc_value < incremental.start_value:
                     continue
 
@@ -183,8 +183,6 @@ class ShopifySource:
                 "Shopify takes care of incrementality on its own, you should not provide incremental_key"
             )
 
-        # shopify://shop_url?api_key=private_app_password
-
         source_fields = urlparse(uri)
         source_params = parse_qs(source_fields.query)
         api_key = source_params.get("api_key")

{ingestr-0.6.6 → ingestr-0.7.0}/ingestr/src/sql_database/__init__.py
@@ -3,7 +3,6 @@
 from typing import Any, Callable, Dict, Iterable, List, Optional, Union
 
 import dlt
-from dlt.common.configuration.specs.config_section_context import ConfigSectionContext
 from dlt.sources import DltResource
 from sqlalchemy import MetaData, Table
 from sqlalchemy.engine import Engine
@@ -12,35 +11,41 @@ from .helpers import (
     SqlDatabaseTableConfiguration,
     SqlTableResourceConfiguration,
     TableBackend,
+    _detect_precision_hints_deprecated,
     engine_from_credentials,
-    get_primary_key,
     table_rows,
 )
-from .override import IngestrConnectionStringCredentials
-from .schema_types import table_to_columns
+from .override import IngestrConnectionStringCredentials as ConnectionStringCredentials
+from .schema_types import (
+    ReflectionLevel,
+    TTypeAdapter,
+    get_primary_key,
+    table_to_columns,
+)
 
 
 @dlt.source
 def sql_database(
-    credentials: Union[
-        IngestrConnectionStringCredentials, Engine, str
-    ] = dlt.secrets.value,
+    credentials: Union[ConnectionStringCredentials, Engine, str] = dlt.secrets.value,
     schema: Optional[str] = dlt.config.value,
     metadata: Optional[MetaData] = None,
     table_names: Optional[List[str]] = dlt.config.value,
     chunk_size: int = 50000,
     backend: TableBackend = "sqlalchemy",
-    detect_precision_hints: Optional[bool] = dlt.config.value,
-    defer_table_reflect: Optional[bool] = dlt.config.value,
+    detect_precision_hints: Optional[bool] = False,
+    reflection_level: Optional[ReflectionLevel] = "full",
+    defer_table_reflect: Optional[bool] = None,
     table_adapter_callback: Callable[[Table], None] = None,
     backend_kwargs: Dict[str, Any] = None,
+    include_views: bool = False,
+    type_adapter_callback: Optional[TTypeAdapter] = None,
 ) -> Iterable[DltResource]:
     """
     A dlt source which loads data from an SQL database using SQLAlchemy.
     Resources are automatically created for each table in the schema or from the given list of tables.
 
     Args:
-        credentials (Union[IngestrConnectionStringCredentials, Engine, str]): Database credentials or an `sqlalchemy.Engine` instance.
+        credentials (Union[ConnectionStringCredentials, Engine, str]): Database credentials or an `sqlalchemy.Engine` instance.
         schema (Optional[str]): Name of the database schema to load (if different from default).
         metadata (Optional[MetaData]): Optional `sqlalchemy.MetaData` instance. `schema` argument is ignored when this is used.
         table_names (Optional[List[str]]): A list of table names to load. By default, all tables in the schema are loaded.
@@ -49,15 +54,30 @@ def sql_database(
             "sqlalchemy" yields batches as lists of Python dictionaries, "pyarrow" and "connectorx" yield batches as arrow tables, "pandas" yields panda frames.
             "sqlalchemy" is the default and does not require additional dependencies, "pyarrow" creates stable destination schemas with correct data types,
             "connectorx" is typically the fastest but ignores the "chunk_size" so you must deal with large tables yourself.
-        detect_precision_hints (bool): Set column precision and scale hints for supported data types in the target schema based on the columns in the source tables.
+        detect_precision_hints (bool): Deprecated. Use `reflection_level`. Set column precision and scale hints for supported data types in the target schema based on the columns in the source tables.
             This is disabled by default.
+        reflection_level: (ReflectionLevel): Specifies how much information should be reflected from the source database schema.
+            "minimal": Only table names, nullability and primary keys are reflected. Data types are inferred from the data.
+            "full": Data types will be reflected on top of "minimal". `dlt` will coerce the data into reflected types if necessary. This is the default option.
+            "full_with_precision": Sets precision and scale on supported data types (ie. decimal, text, binary). Creates big and regular integer types.
         defer_table_reflect (bool): Will connect and reflect table schema only when yielding data. Requires table_names to be explicitly passed.
             Enable this option when running on Airflow. Available on dlt 0.4.4 and later.
         table_adapter_callback: (Callable): Receives each reflected table. May be used to modify the list of columns that will be selected.
         backend_kwargs (**kwargs): kwargs passed to table backend ie. "conn" is used to pass specialized connection string to connectorx.
+        include_views (bool): Reflect views as well as tables. Note view names included in `table_names` are always included regardless of this setting.
+        type_adapter_callback(Optional[Callable]): Callable to override type inference when reflecting columns.
+            Argument is a single sqlalchemy data type (`TypeEngine` instance) and it should return another sqlalchemy data type, or `None` (type will be inferred from data)
     Returns:
+
         Iterable[DltResource]: A list of DLT resources for each table to be loaded.
     """
+    # detect precision hints is deprecated
+    _detect_precision_hints_deprecated(detect_precision_hints)
+
+    if detect_precision_hints:
+        reflection_level = "full_with_precision"
+    else:
+        reflection_level = reflection_level or "minimal"
 
     # set up alchemy engine
     engine = engine_from_credentials(credentials)
@@ -73,49 +93,49 @@ def sql_database(
     else:
         if defer_table_reflect:
             raise ValueError("You must pass table names to defer table reflection")
-        metadata.reflect(bind=engine)
+        metadata.reflect(bind=engine, views=include_views)
         tables = list(metadata.tables.values())
 
     for table in tables:
         if table_adapter_callback and not defer_table_reflect:
             table_adapter_callback(table)
+
         yield dlt.resource(
             table_rows,
             name=table.name,
            primary_key=get_primary_key(table),
            spec=SqlDatabaseTableConfiguration,
-            columns=table_to_columns(table, detect_precision_hints),
+            columns=table_to_columns(table, reflection_level, type_adapter_callback),
         )(
             engine,
             table,
             chunk_size,
             backend,
-            detect_precision_hints=detect_precision_hints,
+            reflection_level=reflection_level,
             defer_table_reflect=defer_table_reflect,
             table_adapter_callback=table_adapter_callback,
             backend_kwargs=backend_kwargs,
+            type_adapter_callback=type_adapter_callback,
         )
 
 
-@dlt.sources.config.with_config(
-    sections=("sources", "sql_database"),
-    spec=SqlTableResourceConfiguration,
-    sections_merge_style=ConfigSectionContext.resource_merge_style,
+@dlt.resource(
+    name=lambda args: args["table"], standalone=True, spec=SqlTableResourceConfiguration
 )
 def sql_table(
-    credentials: Union[
-        IngestrConnectionStringCredentials, Engine, str
-    ] = dlt.secrets.value,
+    credentials: Union[ConnectionStringCredentials, Engine, str] = str,
     table: str = dlt.config.value,
     schema: Optional[str] = dlt.config.value,
     metadata: Optional[MetaData] = None,
     incremental: Optional[dlt.sources.incremental[Any]] = None,
-    chunk_size: int = 1000,
+    chunk_size: int = 50000,
     backend: TableBackend = "sqlalchemy",
-    detect_precision_hints: Optional[bool] = dlt.config.value,
-    defer_table_reflect: Optional[bool] = dlt.config.value,
+    detect_precision_hints: Optional[bool] = None,
+    reflection_level: Optional[ReflectionLevel] = "full_with_precision",
+    defer_table_reflect: Optional[bool] = None,
     table_adapter_callback: Callable[[Table], None] = None,
     backend_kwargs: Dict[str, Any] = None,
+    type_adapter_callback: Optional[TTypeAdapter] = None,
     merge_key: Optional[str] = None,
 ) -> DltResource:
     """
@@ -123,7 +143,7 @@ def sql_table(
 
     Args:
         credentials (Union[ConnectionStringCredentials, Engine, str]): Database credentials or an `Engine` instance representing the database connection.
-        table (str): Name of the table to load.
+        table (str): Name of the table or view to load.
         schema (Optional[str]): Optional name of the schema the table belongs to.
         metadata (Optional[MetaData]): Optional `sqlalchemy.MetaData` instance. If provided, the `schema` argument is ignored.
         incremental (Optional[dlt.sources.incremental[Any]]): Option to enable incremental loading for the table.
@@ -133,16 +153,29 @@
             "sqlalchemy" yields batches as lists of Python dictionaries, "pyarrow" and "connectorx" yield batches as arrow tables, "pandas" yields panda frames.
             "sqlalchemy" is the default and does not require additional dependencies, "pyarrow" creates stable destination schemas with correct data types,
             "connectorx" is typically the fastest but ignores the "chunk_size" so you must deal with large tables yourself.
-        detect_precision_hints (bool): Set column precision and scale hints for supported data types in the target schema based on the columns in the source tables.
+        reflection_level: (ReflectionLevel): Specifies how much information should be reflected from the source database schema.
+            "minimal": Only table names, nullability and primary keys are reflected. Data types are inferred from the data.
+            "full": Data types will be reflected on top of "minimal". `dlt` will coerce the data into reflected types if necessary. This is the default option.
+            "full_with_precision": Sets precision and scale on supported data types (ie. decimal, text, binary). Creates big and regular integer types.
+        detect_precision_hints (bool): Deprecated. Use `reflection_level`. Set column precision and scale hints for supported data types in the target schema based on the columns in the source tables.
             This is disabled by default.
         defer_table_reflect (bool): Will connect and reflect table schema only when yielding data. Enable this option when running on Airflow. Available
             on dlt 0.4.4 and later
         table_adapter_callback: (Callable): Receives each reflected table. May be used to modify the list of columns that will be selected.
         backend_kwargs (**kwargs): kwargs passed to table backend ie. "conn" is used to pass specialized connection string to connectorx.
+        type_adapter_callback(Optional[Callable]): Callable to override type inference when reflecting columns.
+            Argument is a single sqlalchemy data type (`TypeEngine` instance) and it should return another sqlalchemy data type, or `None` (type will be inferred from data)
 
     Returns:
         DltResource: The dlt resource for loading data from the SQL database table.
     """
+    _detect_precision_hints_deprecated(detect_precision_hints)
+
+    if detect_precision_hints:
+        reflection_level = "full_with_precision"
+    else:
+        reflection_level = reflection_level or "minimal"
+
     engine = engine_from_credentials(credentials)
     engine.execution_options(stream_results=True, max_row_buffer=2 * chunk_size)
     metadata = metadata or MetaData(schema=schema)
@@ -157,7 +190,7 @@
         table_rows,
         name=table_obj.name,
         primary_key=get_primary_key(table_obj),
-        columns=table_to_columns(table_obj, detect_precision_hints),
+        columns=table_to_columns(table_obj, reflection_level, type_adapter_callback),
         merge_key=merge_key,
     )(
         engine,
@@ -165,8 +198,9 @@
         chunk_size,
         backend,
         incremental=incremental,
-        detect_precision_hints=detect_precision_hints,
+        reflection_level=reflection_level,
         defer_table_reflect=defer_table_reflect,
         table_adapter_callback=table_adapter_callback,
         backend_kwargs=backend_kwargs,
+        type_adapter_callback=type_adapter_callback,
     )
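
Taken together, these hunks rework the public surface of sql_database/sql_table: new reflection_level, type_adapter_callback and include_views parameters, a deprecation shim for detect_precision_hints, a 50000-row default chunk size, and sql_table becoming a standalone @dlt.resource. Below is a minimal, hedged sketch of how the reworked sql_table might be called; only the parameter names and defaults come from the diff, while the connection string, table name and the adapter function are placeholders.

import dlt
import sqlalchemy as sa

from ingestr.src.sql_database import sql_table  # module path as laid out in this package


def prefer_text_for_json(sql_type: sa.types.TypeEngine):
    # Illustrative type adapter: map JSON columns to Text; returning None lets
    # dlt infer the type from the data instead.
    return sa.Text() if isinstance(sql_type, sa.JSON) else None


events = sql_table(
    credentials="postgresql+psycopg2://user:secret@localhost:5432/shop",
    table="events",
    schema="public",
    backend="pyarrow",
    chunk_size=50000,  # new default shown above
    reflection_level="full",  # "minimal" | "full" | "full_with_precision"
    type_adapter_callback=prefer_text_for_json,
)

pipeline = dlt.pipeline(pipeline_name="events_copy", destination="duckdb")
pipeline.run(events, write_disposition="append")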

ingestr-0.7.0/ingestr/src/sql_database/arrow_helpers.py
@@ -0,0 +1,139 @@
+from typing import Any, Optional, Sequence
+
+from dlt.common import logger
+from dlt.common.configuration import with_config
+from dlt.common.destination import DestinationCapabilitiesContext
+from dlt.common.json import custom_encode, map_nested_in_place
+from dlt.common.schema.typing import TTableSchemaColumns
+
+from .schema_types import RowAny
+
+
+@with_config
+def columns_to_arrow(
+    columns_schema: TTableSchemaColumns,
+    caps: DestinationCapabilitiesContext = None,
+    tz: str = "UTC",
+) -> Any:
+    """Converts `column_schema` to arrow schema using `caps` and `tz`. `caps` are injected from the container - which
+    is always the case if run within the pipeline. This will generate arrow schema compatible with the destination.
+    Otherwise generic capabilities are used
+    """
+    from dlt.common.destination.capabilities import DestinationCapabilitiesContext
+    from dlt.common.libs.pyarrow import get_py_arrow_datatype
+    from dlt.common.libs.pyarrow import pyarrow as pa
+
+    return pa.schema(
+        [
+            pa.field(
+                name,
+                get_py_arrow_datatype(
+                    schema_item,
+                    caps or DestinationCapabilitiesContext.generic_capabilities(),
+                    tz,
+                ),
+                nullable=schema_item.get("nullable", True),
+            )
+            for name, schema_item in columns_schema.items()
+            if schema_item.get("data_type") is not None
+        ]
+    )
+
+
+def row_tuples_to_arrow(
+    rows: Sequence[RowAny], columns: TTableSchemaColumns, tz: str
+) -> Any:
+    """Converts the rows to an arrow table using the columns schema.
+    Columns missing `data_type` will be inferred from the row data.
+    Columns with object types not supported by arrow are excluded from the resulting table.
+    """
+    import numpy as np
+    from dlt.common.libs.pyarrow import pyarrow as pa
+
+    try:
+        from pandas._libs import lib
+
+        pivoted_rows = lib.to_object_array_tuples(rows).T  # type: ignore[attr-defined]
+    except ImportError:
+        logger.info(
+            "Pandas not installed, reverting to numpy.asarray to create a table which is slower"
+        )
+        pivoted_rows = np.asarray(rows, dtype="object", order="k").T  # type: ignore[call-overload]
+
+    columnar = {
+        col: dat.ravel()
+        for col, dat in zip(columns, np.vsplit(pivoted_rows, len(columns)))
+    }
+    columnar_known_types = {
+        col["name"]: columnar[col["name"]]
+        for col in columns.values()
+        if col.get("data_type") is not None
+    }
+    columnar_unknown_types = {
+        col["name"]: columnar[col["name"]]
+        for col in columns.values()
+        if col.get("data_type") is None
+    }
+
+    arrow_schema = columns_to_arrow(columns, tz=tz)
+
+    for idx in range(0, len(arrow_schema.names)):
+        field = arrow_schema.field(idx)
+        py_type = type(rows[0][idx])
+        # cast double / float ndarrays to decimals if type mismatch, looks like decimals and floats are often mixed up in dialects
+        if pa.types.is_decimal(field.type) and issubclass(py_type, (str, float)):
+            logger.warning(
+                f"Field {field.name} was reflected as decimal type, but rows contains {py_type.__name__}. Additional cast is required which may slow down arrow table generation."
+            )
+            float_array = pa.array(columnar_known_types[field.name], type=pa.float64())
+            columnar_known_types[field.name] = float_array.cast(field.type, safe=False)
+
+    # If there are unknown type columns, first create a table to infer their types
+    if columnar_unknown_types:
+        new_schema_fields = []
+        for key in list(columnar_unknown_types):
+            arrow_col: Optional[pa.Array] = None
+            try:
+                arrow_col = pa.array(columnar_unknown_types[key])
+                if pa.types.is_null(arrow_col.type):
+                    logger.warning(
+                        f"Column {key} contains only NULL values and data type could not be inferred. This column is removed from a arrow table"
+                    )
+                    continue
+
+            except pa.ArrowInvalid as e:
+                # Try coercing types not supported by arrow to a json friendly format
+                # E.g. dataclasses -> dict, UUID -> str
+                try:
+                    arrow_col = pa.array(
+                        map_nested_in_place(
+                            custom_encode, list(columnar_unknown_types[key])
+                        )
+                    )
+                    logger.warning(
+                        f"Column {key} contains a data type which is not supported by pyarrow and got converted into {arrow_col.type}. This slows down arrow table generation."
+                    )
+                except (pa.ArrowInvalid, TypeError):
+                    logger.warning(
+                        f"Column {key} contains a data type which is not supported by pyarrow. This column will be ignored. Error: {e}"
+                    )
+            if arrow_col is not None:
+                columnar_known_types[key] = arrow_col
+                new_schema_fields.append(
+                    pa.field(
+                        key,
+                        arrow_col.type,
+                        nullable=columns[key]["nullable"],
+                    )
+                )
+
+        # New schema
+        column_order = {name: idx for idx, name in enumerate(columns)}
+        arrow_schema = pa.schema(
+            sorted(
+                list(arrow_schema) + new_schema_fields,
+                key=lambda x: column_order[x.name],
+            )
+        )
+
+    return pa.Table.from_pydict(columnar_known_types, schema=arrow_schema)
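
To make the role of this new helper concrete, here is a small, hedged sketch of feeding it row tuples plus a dlt column schema. Only row_tuples_to_arrow and its signature come from the file above; the schema and rows are invented for illustration (the payload column deliberately omits data_type, so its Arrow type is inferred from the data).

from ingestr.src.sql_database.arrow_helpers import row_tuples_to_arrow

# Hypothetical column schema in dlt's TTableSchemaColumns shape.
columns = {
    "id": {"name": "id", "data_type": "bigint", "nullable": False},
    "amount": {"name": "amount", "data_type": "decimal", "nullable": True},
    "payload": {"name": "payload", "nullable": True},
}

rows = [
    (1, 10.5, {"source": "web"}),
    (2, 7.25, {"source": "mobile"}),
]

table = row_tuples_to_arrow(rows, columns, tz="UTC")
print(table.schema)  # id: int64, amount cast to decimal, payload inferred as a struct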