ingestr 0.9.5__py3-none-any.whl → 0.10.0rc0__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package as they appear in their respective public registries. It is provided for informational purposes only.

Potentially problematic release.


This version of ingestr might be problematic.

ingestr/src/sources.py CHANGED
@@ -10,6 +10,8 @@ import pendulum
  from dlt.common.configuration.specs import AwsCredentials
  from dlt.common.time import ensure_pendulum_datetime
  from dlt.common.typing import TSecretStrValue
+ from dlt.sources.credentials import ConnectionStringCredentials
+ from dlt.sources.sql_database import sql_table

  from ingestr.src.adjust import REQUIRED_CUSTOM_DIMENSIONS, adjust_source
  from ingestr.src.adjust.adjust_helpers import parse_filters
@@ -29,7 +31,6 @@ from ingestr.src.mongodb import mongodb_collection
  from ingestr.src.notion import notion_databases
  from ingestr.src.shopify import shopify_source
  from ingestr.src.slack import slack_source
- from ingestr.src.sql_database import sql_table
  from ingestr.src.stripe_analytics import stripe_source
  from ingestr.src.table_definition import table_string_to_dataclass
  from ingestr.src.zendesk import zendesk_chat, zendesk_support, zendesk_talk
@@ -67,11 +68,10 @@ class SqlSource:
  uri = uri.replace("mysql://", "mysql+pymysql://")

  table_instance = self.table_builder(
- credentials=uri,
+ credentials=ConnectionStringCredentials(uri),
  schema=table_fields.dataset,
  table=table_fields.table,
  incremental=incremental,
- merge_key=kwargs.get("merge_key"),
  backend=kwargs.get("sql_backend", "sqlalchemy"),
  chunk_size=kwargs.get("page_size", None),
  )
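
Taken together, this hunk drops ingestr's vendored sql_table in favor of the sql_table resource bundled with dlt 1.x, wraps the connection string in ConnectionStringCredentials, and stops forwarding merge_key (presumably because the bundled resource does not accept that argument). A minimal sketch of the new call path follows; the connection string, schema and table names are hypothetical and not part of this diff.

import dlt
from dlt.sources.credentials import ConnectionStringCredentials
from dlt.sources.sql_database import sql_table

# Hypothetical URI, schema and table, for illustration only.
uri = "postgresql+psycopg2://user:password@localhost:5432/shop"

orders = sql_table(
    credentials=ConnectionStringCredentials(uri),  # wrapped, as in the updated sources.py
    schema="public",
    table="orders",
    backend="sqlalchemy",  # ingestr's default sql_backend
    chunk_size=50000,      # ingestr forwards its page_size option here
)

pipeline = dlt.pipeline(pipeline_name="orders_copy", destination="duckdb")
pipeline.run(orders)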
ingestr/src/version.py CHANGED
@@ -1 +1 @@
- __version__ = "0.9.5"
+ __version__ = "0.10.0rc0"
ingestr/src/zendesk/__init__.py CHANGED
@@ -280,8 +280,8 @@ def zendesk_support(
  primary_key="id",
  write_disposition="merge",
  columns={
- "tags": {"data_type": "complex"},
- "custom_fields": {"data_type": "complex"},
+ "tags": {"data_type": "json"},
+ "custom_fields": {"data_type": "json"},
  },
  )
  def ticket_table(
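
dlt 1.x renamed the complex column data type to json, which is why these column hints change. A minimal, self-contained sketch of the same hint style on a standalone resource; the resource name and sample record are illustrative and not taken from ingestr.

import dlt

@dlt.resource(
    name="tickets",
    primary_key="id",
    write_disposition="merge",
    columns={
        "tags": {"data_type": "json"},           # "complex" before dlt 1.x
        "custom_fields": {"data_type": "json"},
    },
)
def tickets():
    # One illustrative record whose nested values map to the json column type.
    yield {"id": 1, "tags": ["vip"], "custom_fields": {"tier": "gold"}}

pipeline = dlt.pipeline(pipeline_name="tickets_demo", destination="duckdb")
pipeline.run(tickets())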
@@ -1,6 +1,6 @@
  Metadata-Version: 2.3
  Name: ingestr
- Version: 0.9.5
+ Version: 0.10.0rc0
  Summary: ingestr is a command-line application that ingests data from various sources and stores them in any database.
  Project-URL: Homepage, https://github.com/bruin-data/ingestr
  Project-URL: Issues, https://github.com/bruin-data/ingestr/issues
@@ -13,35 +13,35 @@ Classifier: Operating System :: OS Independent
  Classifier: Programming Language :: Python :: 3
  Classifier: Topic :: Database
  Requires-Python: >=3.9
- Requires-Dist: confluent-kafka>=2.3.0
+ Requires-Dist: confluent-kafka>=2.6.1
  Requires-Dist: databricks-sql-connector==2.9.3
- Requires-Dist: dlt==0.5.1
- Requires-Dist: duckdb-engine==0.11.5
- Requires-Dist: duckdb==0.10.2
+ Requires-Dist: dlt==1.4.0
+ Requires-Dist: duckdb-engine==0.13.5
+ Requires-Dist: duckdb==1.1.3
  Requires-Dist: facebook-business==20.0.0
  Requires-Dist: google-api-python-client==2.130.0
  Requires-Dist: google-cloud-bigquery-storage==2.24.0
- Requires-Dist: mysql-connector-python==9.0.0
+ Requires-Dist: mysql-connector-python==9.1.0
  Requires-Dist: pendulum==3.0.0
- Requires-Dist: psycopg2-binary==2.9.9
- Requires-Dist: py-machineid==0.5.1
+ Requires-Dist: psycopg2-binary==2.9.10
+ Requires-Dist: py-machineid==0.6.0
  Requires-Dist: pyairtable==2.3.3
- Requires-Dist: pymongo==4.6.3
- Requires-Dist: pymysql==1.1.0
- Requires-Dist: pyrate-limiter==3.6.1
- Requires-Dist: redshift-connector==2.1.0
- Requires-Dist: rich==13.7.1
- Requires-Dist: rudder-sdk-python==2.1.0
+ Requires-Dist: pymongo==4.10.1
+ Requires-Dist: pymysql==1.1.1
+ Requires-Dist: pyrate-limiter==3.7.0
+ Requires-Dist: redshift-connector==2.1.3
+ Requires-Dist: rich==13.9.4
+ Requires-Dist: rudder-sdk-python==2.1.4
  Requires-Dist: s3fs==2024.9.0
- Requires-Dist: snowflake-sqlalchemy==1.5.3
- Requires-Dist: sqlalchemy-bigquery==1.11.0
+ Requires-Dist: snowflake-sqlalchemy==1.6.1
+ Requires-Dist: sqlalchemy-bigquery==1.12.0
  Requires-Dist: sqlalchemy-hana==2.0.0
  Requires-Dist: sqlalchemy-redshift==0.8.14
  Requires-Dist: sqlalchemy2-stubs==0.0.2a38
  Requires-Dist: sqlalchemy==1.4.52
  Requires-Dist: stripe==10.7.0
- Requires-Dist: tqdm==4.66.2
- Requires-Dist: typer==0.12.3
+ Requires-Dist: tqdm==4.67.0
+ Requires-Dist: typer==0.13.1
  Requires-Dist: types-requests==2.32.0.20240907
  Provides-Extra: odbc
  Requires-Dist: pyodbc==5.1.0; extra == 'odbc'
@@ -1,10 +1,10 @@
- ingestr/main.py,sha256=B8TAQotJoYSvmaQQm33o2lv99OVLYNz-1Aw_fgQahwE,17718
+ ingestr/main.py,sha256=O0FnyiWBjWu-Inl56EVtdHVWOggTdeEgjKbfp7zT1Kc,20100
  ingestr/src/.gitignore,sha256=8cX1AZTSI0TcdZFGTmS_oyBjpfCzhOEt0DdAo2dFIY8,203
  ingestr/src/destinations.py,sha256=2SfPMjtTelPmzQmc3zNs8xGcKIPuGn_hoZFIBUuhjXI,6338
  ingestr/src/factory.py,sha256=ft81B-YJgvEROkHAZjMjTIS7IYvle-uZQv45b7-Wfk0,4947
- ingestr/src/sources.py,sha256=mG-HPlmPRZQcxChHbIZ0V0bH1EHeko0hbFWnQOcBYRo,33759
+ ingestr/src/sources.py,sha256=t_t4xK-bC1dBf7w1axcopuzIERzPHIHZ_QLiEb7BOhQ,33805
  ingestr/src/table_definition.py,sha256=REbAbqdlmUMUuRh8nEQRreWjPVOQ5ZcfqGkScKdCrmk,390
- ingestr/src/version.py,sha256=ORAtCCI2THBDcdzIbh6oBsoshDvkkmXUWpmO4Q5McAk,22
+ ingestr/src/version.py,sha256=xTnym0lQwATiKdBCgZ51qs6XA-BvYKGOI8vHCckrPzk,26
  ingestr/src/adjust/__init__.py,sha256=oTM7XozDcMuUiCZ0w4gWEBXuCCtMZ0iBfkKdd2pVa1E,3007
  ingestr/src/adjust/adjust_helpers.py,sha256=-tmmxy9k3wms-ZEIgxmlp2cAQ2X_O1lgjY1128bbMu4,3224
  ingestr/src/airtable/__init__.py,sha256=GHWYrjI2qhs_JihdNJysB0Ni3bzqT_MLXn_S9_Q5zRA,2775
@@ -26,7 +26,7 @@ ingestr/src/google_sheets/__init__.py,sha256=5qlX-6ilx5MW7klC7B_0jGSxloQSLkSESTh
  ingestr/src/google_sheets/helpers/__init__.py,sha256=5hXZrZK8cMO3UOuL-s4OKOpdACdihQD0hYYlSEu-iQ8,35
  ingestr/src/google_sheets/helpers/api_calls.py,sha256=RiVfdacbaneszhmuhYilkJnkc9kowZvQUCUxz0G6SlI,5404
  ingestr/src/google_sheets/helpers/data_processing.py,sha256=WYO6z4XjGcG0Hat2J2enb-eLX5mSNVb2vaqRE83FBWU,11000
- ingestr/src/gorgias/__init__.py,sha256=BzX9X1Yc_1Mch6NP1pn26hjRIiaadErgHxkdJHw4P3o,21227
+ ingestr/src/gorgias/__init__.py,sha256=LZ3m6aGuhLVI3eNjvQE0rT4o_wbSPkY_SDKsM-g0V5U,21176
  ingestr/src/gorgias/helpers.py,sha256=DamuijnvhGY9hysQO4txrVMf4izkGbh5qfBKImdOINE,5427
  ingestr/src/hubspot/__init__.py,sha256=eSD_lEIEd16YijAtUATFG8FGO8YGPm-MtAk94KKsx6o,9740
  ingestr/src/hubspot/helpers.py,sha256=PTn-UHJv1ENIvA5azUTaHCmFXgmHLJC1tUatQ1N-KFE,6727
@@ -43,24 +43,19 @@ ingestr/src/notion/settings.py,sha256=MwQVZViJtnvOegfjXYc_pJ50oUYgSRPgwqu7TvpeMO
  ingestr/src/notion/helpers/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  ingestr/src/notion/helpers/client.py,sha256=QXuudkf5Zzff98HRsCqA1g1EZWIrnfn1falPrnKg_y4,5500
  ingestr/src/notion/helpers/database.py,sha256=gigPibTeVefP3lA-8w4aOwX67pj7RlciPk5koDs1ry8,2737
- ingestr/src/shopify/__init__.py,sha256=LWO5hfPfwPrq-gZwSRHThDfLHbGo2wkm6CrN1BCm3CQ,62815
+ ingestr/src/shopify/__init__.py,sha256=Hhv84zRfVsqAGP7pz-PmeopeX9CGu7TXSm3PSXHEwIA,62689
  ingestr/src/shopify/exceptions.py,sha256=BhV3lIVWeBt8Eh4CWGW_REFJpGCzvW6-62yZrBWa3nQ,50
  ingestr/src/shopify/helpers.py,sha256=NfHD6lWXe88ybR0ri-FCQuh2Vf8l5WG0a0FVjmdoSC4,6296
  ingestr/src/shopify/settings.py,sha256=StY0EPr7wFJ7KzRRDN4TKxV0_gkIS1wPj2eR4AYSsDk,141
- ingestr/src/slack/__init__.py,sha256=UfUhkS6FnCKJeXkkJ5QrmdT5nZm5czjtomsQu_x9WUM,9987
+ ingestr/src/slack/__init__.py,sha256=UF-ficQ6K32u1EHytW3P35suACo9wuc6nMrAPViyZL8,9981
  ingestr/src/slack/helpers.py,sha256=08TLK7vhFvH_uekdLVOLF3bTDe1zgH0QxHObXHzk1a8,6545
  ingestr/src/slack/settings.py,sha256=NhKn4y1zokEa5EmIZ05wtj_-I0GOASXZ5V81M1zXCtY,457
- ingestr/src/sql_database/__init__.py,sha256=HEqY6U-YzzbeZ8avIthj-Fatm2C3i3jqYs5DAIAu4Ss,11511
- ingestr/src/sql_database/arrow_helpers.py,sha256=yze1X3A9nUQA4HeuFDDWrfJVkCq8Uo5UyDo_zhJtI60,5699
- ingestr/src/sql_database/helpers.py,sha256=6o8e2_8MIuj3qlo40a2E6ns3gyK18ei1jCePONrMUjI,10191
- ingestr/src/sql_database/override.py,sha256=xbKGDztCzvrhJ5kJTXERal3LA56bEeVug4_rrTs8DgA,333
- ingestr/src/sql_database/schema_types.py,sha256=qXTanvFPE8wMCSDzQWPDi5yqaO-llfrFXjiGJALI4NA,5013
  ingestr/src/stripe_analytics/__init__.py,sha256=8yy6i4DAhUqY4ZForetQ0DWc_YQrY0FBH6yk0Z3m-Mw,4493
  ingestr/src/stripe_analytics/helpers.py,sha256=iqZOyiGIOhOAhVXXU16DP0hkkTKcTrDu69vAJoTxgEo,1976
  ingestr/src/stripe_analytics/settings.py,sha256=rl9L5XumxO0pjkZf7MGesXHp4QLRgnz3RWLuDWDBKXo,380
  ingestr/src/telemetry/event.py,sha256=MpWc5tt0lSJ1pWKe9HQ11BHrcPBxSH40l4wjZi9u0tI,924
  ingestr/src/testdata/fakebqcredentials.json,sha256=scc6TUc963KAbKTLZCfcmqVzbtzDCW1_8JNRnyAXyy8,628
- ingestr/src/zendesk/__init__.py,sha256=FjsDhzKaK7RgFs9zIGjMHHl8TlfYRN3TKocbGiJklD4,17570
+ ingestr/src/zendesk/__init__.py,sha256=C7HkN195DGdOHId2_Sa_kAlcBrUmnVYZUa_tPkiyf1Q,17564
  ingestr/src/zendesk/settings.py,sha256=Vdj706nTJFQ-3KH4nO97iYCQuba3dV3E9gfnmLK6xwU,2294
  ingestr/src/zendesk/helpers/__init__.py,sha256=YTJejCiUjfIcsj9FrkY0l-JGYDI7RRte1Ydq5FDH_0c,888
  ingestr/src/zendesk/helpers/api_helpers.py,sha256=dMkNn4ZQXgJTDOXAAXdmRt41phNFoRhYyPaLJih0pZY,4184
@@ -74,8 +69,8 @@ ingestr/testdata/delete_insert_part2.csv,sha256=B_KUzpzbNdDY_n7wWop1mT2cz36TmayS
  ingestr/testdata/merge_expected.csv,sha256=DReHqWGnQMsf2PBv_Q2pfjsgvikYFnf1zYcQZ7ZqYN0,276
  ingestr/testdata/merge_part1.csv,sha256=Pw8Z9IDKcNU0qQHx1z6BUf4rF_-SxKGFOvymCt4OY9I,185
  ingestr/testdata/merge_part2.csv,sha256=T_GiWxA81SN63_tMOIuemcvboEFeAmbKc7xRXvL9esw,287
- ingestr-0.9.5.dist-info/METADATA,sha256=y1GtmywpIUCyRQrqjAJtD8Qv9gFD7LYm25m1kkKpqxU,7058
- ingestr-0.9.5.dist-info/WHEEL,sha256=C2FUgwZgiLbznR-k0b_5k3Ai_1aASOXDss3lzCUsUug,87
- ingestr-0.9.5.dist-info/entry_points.txt,sha256=oPJy0KBnPWYjDtP1k8qwAihcTLHSZokSQvRAw_wtfJM,46
- ingestr-0.9.5.dist-info/licenses/LICENSE.md,sha256=cW8wIhn8HFE-KLStDF9jHQ1O_ARWP3kTpk_-eOccL24,1075
- ingestr-0.9.5.dist-info/RECORD,,
+ ingestr-0.10.0rc0.dist-info/METADATA,sha256=cRvsEF34onQqsvTFbiaU9w3mb6digQL1iLG4BOi3HfQ,7063
+ ingestr-0.10.0rc0.dist-info/WHEEL,sha256=C2FUgwZgiLbznR-k0b_5k3Ai_1aASOXDss3lzCUsUug,87
+ ingestr-0.10.0rc0.dist-info/entry_points.txt,sha256=oPJy0KBnPWYjDtP1k8qwAihcTLHSZokSQvRAw_wtfJM,46
+ ingestr-0.10.0rc0.dist-info/licenses/LICENSE.md,sha256=cW8wIhn8HFE-KLStDF9jHQ1O_ARWP3kTpk_-eOccL24,1075
+ ingestr-0.10.0rc0.dist-info/RECORD,,
ingestr/src/sql_database/__init__.py DELETED
@@ -1,206 +0,0 @@
- """Source that loads tables form any SQLAlchemy supported database, supports batching requests and incremental loads."""
-
- from typing import Any, Callable, Dict, Iterable, List, Optional, Union
-
- import dlt
- from dlt.sources import DltResource
- from sqlalchemy import MetaData, Table
- from sqlalchemy.engine import Engine
-
- from .helpers import (
- SqlDatabaseTableConfiguration,
- SqlTableResourceConfiguration,
- TableBackend,
- _detect_precision_hints_deprecated,
- engine_from_credentials,
- table_rows,
- )
- from .override import IngestrConnectionStringCredentials as ConnectionStringCredentials
- from .schema_types import (
- ReflectionLevel,
- TTypeAdapter,
- get_primary_key,
- table_to_columns,
- )
-
-
- @dlt.source
- def sql_database(
- credentials: Union[ConnectionStringCredentials, Engine, str] = dlt.secrets.value,
- schema: Optional[str] = dlt.config.value,
- metadata: Optional[MetaData] = None,
- table_names: Optional[List[str]] = dlt.config.value,
- chunk_size: int = 50000,
- backend: TableBackend = "sqlalchemy",
- detect_precision_hints: Optional[bool] = False,
- reflection_level: Optional[ReflectionLevel] = "full",
- defer_table_reflect: Optional[bool] = None,
- table_adapter_callback: Callable[[Table], None] = None,
- backend_kwargs: Dict[str, Any] = None,
- include_views: bool = False,
- type_adapter_callback: Optional[TTypeAdapter] = None,
- ) -> Iterable[DltResource]:
- """
- A dlt source which loads data from an SQL database using SQLAlchemy.
- Resources are automatically created for each table in the schema or from the given list of tables.
-
- Args:
- credentials (Union[ConnectionStringCredentials, Engine, str]): Database credentials or an `sqlalchemy.Engine` instance.
- schema (Optional[str]): Name of the database schema to load (if different from default).
- metadata (Optional[MetaData]): Optional `sqlalchemy.MetaData` instance. `schema` argument is ignored when this is used.
- table_names (Optional[List[str]]): A list of table names to load. By default, all tables in the schema are loaded.
- chunk_size (int): Number of rows yielded in one batch. SQL Alchemy will create additional internal rows buffer twice the chunk size.
- backend (TableBackend): Type of backend to generate table data. One of: "sqlalchemy", "pyarrow", "pandas" and "connectorx".
- "sqlalchemy" yields batches as lists of Python dictionaries, "pyarrow" and "connectorx" yield batches as arrow tables, "pandas" yields panda frames.
- "sqlalchemy" is the default and does not require additional dependencies, "pyarrow" creates stable destination schemas with correct data types,
- "connectorx" is typically the fastest but ignores the "chunk_size" so you must deal with large tables yourself.
- detect_precision_hints (bool): Deprecated. Use `reflection_level`. Set column precision and scale hints for supported data types in the target schema based on the columns in the source tables.
- This is disabled by default.
- reflection_level: (ReflectionLevel): Specifies how much information should be reflected from the source database schema.
- "minimal": Only table names, nullability and primary keys are reflected. Data types are inferred from the data.
- "full": Data types will be reflected on top of "minimal". `dlt` will coerce the data into reflected types if necessary. This is the default option.
- "full_with_precision": Sets precision and scale on supported data types (ie. decimal, text, binary). Creates big and regular integer types.
- defer_table_reflect (bool): Will connect and reflect table schema only when yielding data. Requires table_names to be explicitly passed.
- Enable this option when running on Airflow. Available on dlt 0.4.4 and later.
- table_adapter_callback: (Callable): Receives each reflected table. May be used to modify the list of columns that will be selected.
- backend_kwargs (**kwargs): kwargs passed to table backend ie. "conn" is used to pass specialized connection string to connectorx.
- include_views (bool): Reflect views as well as tables. Note view names included in `table_names` are always included regardless of this setting.
- type_adapter_callback(Optional[Callable]): Callable to override type inference when reflecting columns.
- Argument is a single sqlalchemy data type (`TypeEngine` instance) and it should return another sqlalchemy data type, or `None` (type will be inferred from data)
- Returns:
-
- Iterable[DltResource]: A list of DLT resources for each table to be loaded.
- """
- # detect precision hints is deprecated
- _detect_precision_hints_deprecated(detect_precision_hints)
-
- if detect_precision_hints:
- reflection_level = "full_with_precision"
- else:
- reflection_level = reflection_level or "minimal"
-
- # set up alchemy engine
- engine = engine_from_credentials(credentials)
- engine.execution_options(stream_results=True, max_row_buffer=2 * chunk_size)
- metadata = metadata or MetaData(schema=schema)
-
- # use provided tables or all tables
- if table_names:
- tables = [
- Table(name, metadata, autoload_with=None if defer_table_reflect else engine)
- for name in table_names
- ]
- else:
- if defer_table_reflect:
- raise ValueError("You must pass table names to defer table reflection")
- metadata.reflect(bind=engine, views=include_views)
- tables = list(metadata.tables.values())
-
- for table in tables:
- if table_adapter_callback and not defer_table_reflect:
- table_adapter_callback(table)
-
- yield dlt.resource(
- table_rows,
- name=table.name,
- primary_key=get_primary_key(table),
- spec=SqlDatabaseTableConfiguration,
- columns=table_to_columns(table, reflection_level, type_adapter_callback),
- )(
- engine,
- table,
- chunk_size,
- backend,
- reflection_level=reflection_level,
- defer_table_reflect=defer_table_reflect,
- table_adapter_callback=table_adapter_callback,
- backend_kwargs=backend_kwargs,
- type_adapter_callback=type_adapter_callback,
- )
-
-
- @dlt.resource(
- name=lambda args: args["table"], standalone=True, spec=SqlTableResourceConfiguration
- )
- def sql_table(
- credentials: Union[ConnectionStringCredentials, Engine, str] = str,
- table: str = dlt.config.value,
- schema: Optional[str] = dlt.config.value,
- metadata: Optional[MetaData] = None,
- incremental: Optional[dlt.sources.incremental[Any]] = None,
- chunk_size: int = 50000,
- backend: TableBackend = "sqlalchemy",
- detect_precision_hints: Optional[bool] = None,
- reflection_level: Optional[ReflectionLevel] = "full_with_precision",
- defer_table_reflect: Optional[bool] = None,
- table_adapter_callback: Callable[[Table], None] = None,
- backend_kwargs: Dict[str, Any] = None,
- type_adapter_callback: Optional[TTypeAdapter] = None,
- merge_key: Optional[str] = None,
- ) -> DltResource:
- """
- A dlt resource which loads data from an SQL database table using SQLAlchemy.
-
- Args:
- credentials (Union[ConnectionStringCredentials, Engine, str]): Database credentials or an `Engine` instance representing the database connection.
- table (str): Name of the table or view to load.
- schema (Optional[str]): Optional name of the schema the table belongs to.
- metadata (Optional[MetaData]): Optional `sqlalchemy.MetaData` instance. If provided, the `schema` argument is ignored.
- incremental (Optional[dlt.sources.incremental[Any]]): Option to enable incremental loading for the table.
- E.g., `incremental=dlt.sources.incremental('updated_at', pendulum.parse('2022-01-01T00:00:00Z'))`
- chunk_size (int): Number of rows yielded in one batch. SQL Alchemy will create additional internal rows buffer twice the chunk size.
- backend (TableBackend): Type of backend to generate table data. One of: "sqlalchemy", "pyarrow", "pandas" and "connectorx".
- "sqlalchemy" yields batches as lists of Python dictionaries, "pyarrow" and "connectorx" yield batches as arrow tables, "pandas" yields panda frames.
- "sqlalchemy" is the default and does not require additional dependencies, "pyarrow" creates stable destination schemas with correct data types,
- "connectorx" is typically the fastest but ignores the "chunk_size" so you must deal with large tables yourself.
- reflection_level: (ReflectionLevel): Specifies how much information should be reflected from the source database schema.
- "minimal": Only table names, nullability and primary keys are reflected. Data types are inferred from the data.
- "full": Data types will be reflected on top of "minimal". `dlt` will coerce the data into reflected types if necessary. This is the default option.
- "full_with_precision": Sets precision and scale on supported data types (ie. decimal, text, binary). Creates big and regular integer types.
- detect_precision_hints (bool): Deprecated. Use `reflection_level`. Set column precision and scale hints for supported data types in the target schema based on the columns in the source tables.
- This is disabled by default.
- defer_table_reflect (bool): Will connect and reflect table schema only when yielding data. Enable this option when running on Airflow. Available
- on dlt 0.4.4 and later
- table_adapter_callback: (Callable): Receives each reflected table. May be used to modify the list of columns that will be selected.
- backend_kwargs (**kwargs): kwargs passed to table backend ie. "conn" is used to pass specialized connection string to connectorx.
- type_adapter_callback(Optional[Callable]): Callable to override type inference when reflecting columns.
- Argument is a single sqlalchemy data type (`TypeEngine` instance) and it should return another sqlalchemy data type, or `None` (type will be inferred from data)
-
- Returns:
- DltResource: The dlt resource for loading data from the SQL database table.
- """
- _detect_precision_hints_deprecated(detect_precision_hints)
-
- if detect_precision_hints:
- reflection_level = "full_with_precision"
- else:
- reflection_level = reflection_level or "minimal"
-
- engine = engine_from_credentials(credentials)
- engine.execution_options(stream_results=True, max_row_buffer=2 * chunk_size)
- metadata = metadata or MetaData(schema=schema)
-
- table_obj = Table(
- table, metadata, autoload_with=None if defer_table_reflect else engine
- )
- if table_adapter_callback and not defer_table_reflect:
- table_adapter_callback(table_obj)
-
- return dlt.resource(
- table_rows,
- name=table_obj.name,
- primary_key=get_primary_key(table_obj),
- columns=table_to_columns(table_obj, reflection_level, type_adapter_callback),
- merge_key=merge_key,
- )(
- engine,
- table_obj,
- chunk_size,
- backend,
- incremental=incremental,
- reflection_level=reflection_level,
- defer_table_reflect=defer_table_reflect,
- table_adapter_callback=table_adapter_callback,
- backend_kwargs=backend_kwargs,
- type_adapter_callback=type_adapter_callback,
- )
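
The whole vendored sql_database module is deleted; with the dependency bump to dlt 1.4.0, the upstream dlt.sources.sql_database source provides equivalent functionality. A short sketch of the upstream source-level API that replaces it, with a hypothetical connection string and table list that are not part of this diff.

import dlt
from dlt.sources.credentials import ConnectionStringCredentials
from dlt.sources.sql_database import sql_database

# Hypothetical URI and table names, for illustration only.
creds = ConnectionStringCredentials("postgresql+psycopg2://user:password@localhost:5432/shop")

# Builds one resource per listed table, much like the deleted sql_database() source did.
source = sql_database(credentials=creds, schema="public", table_names=["orders", "customers"])

pipeline = dlt.pipeline(pipeline_name="shop_copy", destination="duckdb")
pipeline.run(source)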
ingestr/src/sql_database/arrow_helpers.py DELETED
@@ -1,139 +0,0 @@
- from typing import Any, Optional, Sequence
-
- from dlt.common import logger
- from dlt.common.configuration import with_config
- from dlt.common.destination import DestinationCapabilitiesContext
- from dlt.common.json import custom_encode, map_nested_in_place
- from dlt.common.schema.typing import TTableSchemaColumns
-
- from .schema_types import RowAny
-
-
- @with_config
- def columns_to_arrow(
- columns_schema: TTableSchemaColumns,
- caps: DestinationCapabilitiesContext = None,
- tz: str = "UTC",
- ) -> Any:
- """Converts `column_schema` to arrow schema using `caps` and `tz`. `caps` are injected from the container - which
- is always the case if run within the pipeline. This will generate arrow schema compatible with the destination.
- Otherwise generic capabilities are used
- """
- from dlt.common.destination.capabilities import DestinationCapabilitiesContext
- from dlt.common.libs.pyarrow import get_py_arrow_datatype
- from dlt.common.libs.pyarrow import pyarrow as pa
-
- return pa.schema(
- [
- pa.field(
- name,
- get_py_arrow_datatype(
- schema_item,
- caps or DestinationCapabilitiesContext.generic_capabilities(),
- tz,
- ),
- nullable=schema_item.get("nullable", True),
- )
- for name, schema_item in columns_schema.items()
- if schema_item.get("data_type") is not None
- ]
- )
-
-
- def row_tuples_to_arrow(
- rows: Sequence[RowAny], columns: TTableSchemaColumns, tz: str
- ) -> Any:
- """Converts the rows to an arrow table using the columns schema.
- Columns missing `data_type` will be inferred from the row data.
- Columns with object types not supported by arrow are excluded from the resulting table.
- """
- import numpy as np
- from dlt.common.libs.pyarrow import pyarrow as pa
-
- try:
- from pandas._libs import lib
-
- pivoted_rows = lib.to_object_array_tuples(rows).T # type: ignore[attr-defined]
- except ImportError:
- logger.info(
- "Pandas not installed, reverting to numpy.asarray to create a table which is slower"
- )
- pivoted_rows = np.asarray(rows, dtype="object", order="k").T # type: ignore[call-overload]
-
- columnar = {
- col: dat.ravel()
- for col, dat in zip(columns, np.vsplit(pivoted_rows, len(columns)))
- }
- columnar_known_types = {
- col["name"]: columnar[col["name"]]
- for col in columns.values()
- if col.get("data_type") is not None
- }
- columnar_unknown_types = {
- col["name"]: columnar[col["name"]]
- for col in columns.values()
- if col.get("data_type") is None
- }
-
- arrow_schema = columns_to_arrow(columns, tz=tz)
-
- for idx in range(0, len(arrow_schema.names)):
- field = arrow_schema.field(idx)
- py_type = type(rows[0][idx])
- # cast double / float ndarrays to decimals if type mismatch, looks like decimals and floats are often mixed up in dialects
- if pa.types.is_decimal(field.type) and issubclass(py_type, (str, float)):
- logger.warning(
- f"Field {field.name} was reflected as decimal type, but rows contains {py_type.__name__}. Additional cast is required which may slow down arrow table generation."
- )
- float_array = pa.array(columnar_known_types[field.name], type=pa.float64())
- columnar_known_types[field.name] = float_array.cast(field.type, safe=False)
-
- # If there are unknown type columns, first create a table to infer their types
- if columnar_unknown_types:
- new_schema_fields = []
- for key in list(columnar_unknown_types):
- arrow_col: Optional[pa.Array] = None
- try:
- arrow_col = pa.array(columnar_unknown_types[key])
- if pa.types.is_null(arrow_col.type):
- logger.warning(
- f"Column {key} contains only NULL values and data type could not be inferred. This column is removed from a arrow table"
- )
- continue
-
- except pa.ArrowInvalid as e:
- # Try coercing types not supported by arrow to a json friendly format
- # E.g. dataclasses -> dict, UUID -> str
- try:
- arrow_col = pa.array(
- map_nested_in_place(
- custom_encode, list(columnar_unknown_types[key])
- )
- )
- logger.warning(
- f"Column {key} contains a data type which is not supported by pyarrow and got converted into {arrow_col.type}. This slows down arrow table generation."
- )
- except (pa.ArrowInvalid, TypeError):
- logger.warning(
- f"Column {key} contains a data type which is not supported by pyarrow. This column will be ignored. Error: {e}"
- )
- if arrow_col is not None:
- columnar_known_types[key] = arrow_col
- new_schema_fields.append(
- pa.field(
- key,
- arrow_col.type,
- nullable=columns[key]["nullable"],
- )
- )
-
- # New schema
- column_order = {name: idx for idx, name in enumerate(columns)}
- arrow_schema = pa.schema(
- sorted(
- list(arrow_schema) + new_schema_fields,
- key=lambda x: column_order[x.name],
- )
- )
-
- return pa.Table.from_pydict(columnar_known_types, schema=arrow_schema)
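
The deleted helper pivoted row tuples into columns and assembled a pyarrow table against a schema derived from the reflected column types; dlt's bundled sql_database source now carries its own arrow conversion for the pyarrow backend. The core idea, reduced to plain pyarrow; the column names, types and rows below are made up for illustration.

import pyarrow as pa

# Hypothetical reflected schema and row tuples.
schema = pa.schema([pa.field("id", pa.int64()), pa.field("name", pa.string())])
rows = [(1, "alpha"), (2, "beta")]

# Pivot row tuples into columns, then build the table against the known schema.
columnar = {name: [row[i] for row in rows] for i, name in enumerate(schema.names)}
table = pa.Table.from_pydict(columnar, schema=schema)
print(table)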