ingestr 0.3.1__tar.gz → 0.3.3__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of ingestr might be problematic.

Files changed (71):
  1. {ingestr-0.3.1 → ingestr-0.3.3}/PKG-INFO +1 -1
  2. {ingestr-0.3.1 → ingestr-0.3.3}/docs/supported-sources/notion.md +1 -18
  3. {ingestr-0.3.1 → ingestr-0.3.3}/ingestr/main.py +58 -23
  4. {ingestr-0.3.1 → ingestr-0.3.3}/ingestr/src/notion/__init__.py +1 -1
  5. ingestr-0.3.3/ingestr/src/version.py +1 -0
  6. {ingestr-0.3.1 → ingestr-0.3.3}/ingestr/testdata/test_append.db +0 -0
  7. {ingestr-0.3.1 → ingestr-0.3.3}/ingestr/testdata/test_create_replace.db +0 -0
  8. ingestr-0.3.1/ingestr/testdata/test_merge_with_primary_key.db → ingestr-0.3.3/ingestr/testdata/test_delete_insert_with_timerange.db +0 -0
  9. {ingestr-0.3.1 → ingestr-0.3.3}/ingestr/testdata/test_delete_insert_without_primary_key.db +0 -0
  10. ingestr-0.3.1/ingestr/testdata/test_delete_insert_with_timerange.db → ingestr-0.3.3/ingestr/testdata/test_merge_with_primary_key.db +0 -0
  11. ingestr-0.3.1/docs/supported-sources/images/notion_example.png +0 -0
  12. ingestr-0.3.1/ingestr/src/version.py +0 -1
  13. {ingestr-0.3.1 → ingestr-0.3.3}/.dockerignore +0 -0
  14. {ingestr-0.3.1 → ingestr-0.3.3}/.github/workflows/deploy-docs.yml +0 -0
  15. {ingestr-0.3.1 → ingestr-0.3.3}/.github/workflows/docker.yml +0 -0
  16. {ingestr-0.3.1 → ingestr-0.3.3}/.gitignore +0 -0
  17. {ingestr-0.3.1 → ingestr-0.3.3}/Dockerfile +0 -0
  18. {ingestr-0.3.1 → ingestr-0.3.3}/LICENSE.md +0 -0
  19. {ingestr-0.3.1 → ingestr-0.3.3}/Makefile +0 -0
  20. {ingestr-0.3.1 → ingestr-0.3.3}/README.md +0 -0
  21. {ingestr-0.3.1 → ingestr-0.3.3}/docs/.vitepress/config.mjs +0 -0
  22. {ingestr-0.3.1 → ingestr-0.3.3}/docs/.vitepress/theme/custom.css +0 -0
  23. {ingestr-0.3.1 → ingestr-0.3.3}/docs/.vitepress/theme/index.js +0 -0
  24. {ingestr-0.3.1 → ingestr-0.3.3}/docs/commands/example-uris.md +0 -0
  25. {ingestr-0.3.1 → ingestr-0.3.3}/docs/commands/ingest.md +0 -0
  26. {ingestr-0.3.1 → ingestr-0.3.3}/docs/getting-started/core-concepts.md +0 -0
  27. {ingestr-0.3.1 → ingestr-0.3.3}/docs/getting-started/incremental-loading.md +0 -0
  28. {ingestr-0.3.1 → ingestr-0.3.3}/docs/getting-started/quickstart.md +0 -0
  29. {ingestr-0.3.1 → ingestr-0.3.3}/docs/getting-started/telemetry.md +0 -0
  30. {ingestr-0.3.1 → ingestr-0.3.3}/docs/index.md +0 -0
  31. {ingestr-0.3.1 → ingestr-0.3.3}/docs/supported-sources/bigquery.md +0 -0
  32. {ingestr-0.3.1 → ingestr-0.3.3}/docs/supported-sources/csv.md +0 -0
  33. {ingestr-0.3.1 → ingestr-0.3.3}/docs/supported-sources/databricks.md +0 -0
  34. {ingestr-0.3.1 → ingestr-0.3.3}/docs/supported-sources/duckdb.md +0 -0
  35. {ingestr-0.3.1 → ingestr-0.3.3}/docs/supported-sources/mongodb.md +0 -0
  36. {ingestr-0.3.1 → ingestr-0.3.3}/docs/supported-sources/mssql.md +0 -0
  37. {ingestr-0.3.1 → ingestr-0.3.3}/docs/supported-sources/mysql.md +0 -0
  38. {ingestr-0.3.1 → ingestr-0.3.3}/docs/supported-sources/oracle.md +0 -0
  39. {ingestr-0.3.1 → ingestr-0.3.3}/docs/supported-sources/overview.md +0 -0
  40. {ingestr-0.3.1 → ingestr-0.3.3}/docs/supported-sources/postgres.md +0 -0
  41. {ingestr-0.3.1 → ingestr-0.3.3}/docs/supported-sources/redshift.md +0 -0
  42. {ingestr-0.3.1 → ingestr-0.3.3}/docs/supported-sources/snowflake.md +0 -0
  43. {ingestr-0.3.1 → ingestr-0.3.3}/docs/supported-sources/sqlite.md +0 -0
  44. {ingestr-0.3.1 → ingestr-0.3.3}/ingestr/main_test.py +0 -0
  45. {ingestr-0.3.1 → ingestr-0.3.3}/ingestr/src/destinations.py +0 -0
  46. {ingestr-0.3.1 → ingestr-0.3.3}/ingestr/src/destinations_test.py +0 -0
  47. {ingestr-0.3.1 → ingestr-0.3.3}/ingestr/src/factory.py +0 -0
  48. {ingestr-0.3.1 → ingestr-0.3.3}/ingestr/src/factory_test.py +0 -0
  49. {ingestr-0.3.1 → ingestr-0.3.3}/ingestr/src/mongodb/__init__.py +0 -0
  50. {ingestr-0.3.1 → ingestr-0.3.3}/ingestr/src/mongodb/helpers.py +0 -0
  51. {ingestr-0.3.1 → ingestr-0.3.3}/ingestr/src/notion/helpers/__init__.py +0 -0
  52. {ingestr-0.3.1 → ingestr-0.3.3}/ingestr/src/notion/helpers/client.py +0 -0
  53. {ingestr-0.3.1 → ingestr-0.3.3}/ingestr/src/notion/helpers/database.py +0 -0
  54. {ingestr-0.3.1 → ingestr-0.3.3}/ingestr/src/notion/settings.py +0 -0
  55. {ingestr-0.3.1 → ingestr-0.3.3}/ingestr/src/sources.py +0 -0
  56. {ingestr-0.3.1 → ingestr-0.3.3}/ingestr/src/sources_test.py +0 -0
  57. {ingestr-0.3.1 → ingestr-0.3.3}/ingestr/src/sql_database/__init__.py +0 -0
  58. {ingestr-0.3.1 → ingestr-0.3.3}/ingestr/src/sql_database/helpers.py +0 -0
  59. {ingestr-0.3.1 → ingestr-0.3.3}/ingestr/src/sql_database/override.py +0 -0
  60. {ingestr-0.3.1 → ingestr-0.3.3}/ingestr/src/sql_database/schema_types.py +0 -0
  61. {ingestr-0.3.1 → ingestr-0.3.3}/ingestr/src/telemetry/event.py +0 -0
  62. {ingestr-0.3.1 → ingestr-0.3.3}/ingestr/src/testdata/fakebqcredentials.json +0 -0
  63. {ingestr-0.3.1 → ingestr-0.3.3}/ingestr/testdata/.gitignore +0 -0
  64. {ingestr-0.3.1 → ingestr-0.3.3}/package-lock.json +0 -0
  65. {ingestr-0.3.1 → ingestr-0.3.3}/package.json +0 -0
  66. {ingestr-0.3.1 → ingestr-0.3.3}/pyproject.toml +0 -0
  67. {ingestr-0.3.1 → ingestr-0.3.3}/requirements-dev.txt +0 -0
  68. {ingestr-0.3.1 → ingestr-0.3.3}/requirements.txt +0 -0
  69. {ingestr-0.3.1 → ingestr-0.3.3}/resources/demo.gif +0 -0
  70. {ingestr-0.3.1 → ingestr-0.3.3}/resources/demo.tape +0 -0
  71. {ingestr-0.3.1 → ingestr-0.3.3}/resources/ingestr.svg +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: ingestr
3
- Version: 0.3.1
3
+ Version: 0.3.3
4
4
  Summary: ingestr is a command-line application that ingests data from various sources and stores them in any database.
5
5
  Project-URL: Homepage, https://github.com/bruin-data/ingestr
6
6
  Project-URL: Issues, https://github.com/bruin-data/ingestr/issues
@@ -25,24 +25,7 @@ Once you complete the guide, you should have an API key, and the table ID to con
25
25
  ingestr ingest --source-uri 'notion://?api_key=secret_12345' --source-table 'bfeaafc0c25f40a9asdasd672a9456f3' --dest-uri duckdb:///notion.duckdb --dest-table 'notion.output'
26
26
  ```
27
27
 
28
- The result of this command will be a bunch of tables in the `notion.duckdb` database. The Notion integration creates a bunch of extra tables in the schema to keep track of additional information about every field in a database. You should take some time to play around with the data and understand how it's structured, and take a good look at `_dlt_parent_id` column in the tables to understand the relationships between tables.
29
-
30
- Take a look at the following Notion table:
31
- ![an example Notion database](./images/notion_example.png)
32
-
33
- Ingesting this table using ingestr will create a bunch of new tables with quite a lot of details in them. The following query is a reconstruction of the table as it looks on Notion:
34
-
35
- ```sql
36
- select n.text__content, s.text__content, o.properties__numerical_value__number, r.text__content
37
- from notion.output o
38
- join notion.output__properties__name__title n on n._dlt_parent_id = o._dlt_id
39
- join notion.output__properties__another_col__rich_text r on r._dlt_parent_id = o._dlt_id
40
- join notion.output__properties__second_value__rich_text s on s._dlt_parent_id = o._dlt_id
41
- order by 1;
42
- ```
43
-
44
- Take this as a starting point and play around with the data.
45
-
28
+ The result of this command will be a table in the `notion.duckdb` database with JSON columns.
46
29
 
47
30
  > [!CAUTION]
48
31
  > Notion does not support incremental loading, which means every time you run the command, it will copy the entire table from Notion to the destination. This can be slow for large tables.
@@ -1,5 +1,6 @@
1
1
  import hashlib
2
2
  from datetime import datetime
3
+ from enum import Enum
3
4
  from typing import Optional
4
5
 
5
6
  import dlt
@@ -41,6 +42,9 @@ PARQUET_SUPPORTED_DESTINATIONS = [
41
42
  "synapse",
42
43
  ]
43
44
 
45
+ # these sources would return a JSON for sure, which means they cannot be used with Parquet loader for BigQuery
46
+ JSON_RETURNING_SOURCES = ["notion"]
47
+
44
48
 
45
49
  class SpinnerCollector(Collector):
46
50
  status: Status
@@ -80,6 +84,30 @@ class SpinnerCollector(Collector):
80
84
  self.status.stop()
81
85
 
82
86
 
87
+ class IncrementalStrategy(str, Enum):
88
+ create_replace = "replace"
89
+ append = "append"
90
+ delete_insert = "delete+insert"
91
+ merge = "merge"
92
+
93
+
94
+ class LoaderFileFormat(str, Enum):
95
+ jsonl = "jsonl"
96
+ parquet = "parquet"
97
+ insert_values = "insert_values"
98
+ csv = "csv"
99
+
100
+
101
+ class SqlBackend(str, Enum):
102
+ sqlalchemy = "sqlalchemy"
103
+ pyarrow = "pyarrow"
104
+
105
+
106
+ class Progress(str, Enum):
107
+ interactive = "interactive"
108
+ log = "log"
109
+
110
+
83
111
  @app.command()
84
112
  def ingest(
85
113
  source_uri: Annotated[
@@ -114,12 +142,12 @@ def ingest(
114
142
  ),
115
143
  ] = None, # type: ignore
116
144
  incremental_strategy: Annotated[
117
- str,
145
+ IncrementalStrategy,
118
146
  typer.Option(
119
- help="The incremental strategy to use, must be one of 'replace', 'append', 'delete+insert', or 'merge'",
147
+ help="The incremental strategy to use",
120
148
  envvar="INCREMENTAL_STRATEGY",
121
149
  ),
122
- ] = "replace", # type: ignore
150
+ ] = IncrementalStrategy.create_replace, # type: ignore
123
151
  interval_start: Annotated[
124
152
  Optional[datetime],
125
153
  typer.Option(
@@ -158,26 +186,26 @@ def ingest(
158
186
  ),
159
187
  ] = False, # type: ignore
160
188
  progress: Annotated[
161
- Optional[str],
189
+ Progress,
162
190
  typer.Option(
163
191
  help="The progress display type, must be one of 'interactive', 'log'",
164
192
  envvar="PROGRESS",
165
193
  ),
166
- ] = "interactive", # type: ignore
194
+ ] = Progress.interactive, # type: ignore
167
195
  sql_backend: Annotated[
168
- Optional[str],
196
+ SqlBackend,
169
197
  typer.Option(
170
- help="The SQL backend to use, must be one of 'sqlalchemy', 'pyarrow'",
198
+ help="The SQL backend to use",
171
199
  envvar="SQL_BACKEND",
172
200
  ),
173
- ] = "pyarrow", # type: ignore
201
+ ] = SqlBackend.pyarrow, # type: ignore
174
202
  loader_file_format: Annotated[
175
- Optional[str],
203
+ Optional[LoaderFileFormat],
176
204
  typer.Option(
177
- help="The file format to use when loading data, must be one of 'jsonl', 'parquet', 'default'",
205
+ help="The file format to use when loading data",
178
206
  envvar="LOADER_FILE_FORMAT",
179
207
  ),
180
- ] = "default", # type: ignore
208
+ ] = None, # type: ignore
181
209
  ):
182
210
  track(
183
211
  "command_triggered",
@@ -207,15 +235,15 @@ def ingest(
207
235
  original_incremental_strategy = incremental_strategy
208
236
 
209
237
  merge_key = None
210
- if incremental_strategy == "delete+insert":
238
+ if incremental_strategy == IncrementalStrategy.delete_insert:
211
239
  merge_key = incremental_key
212
- incremental_strategy = "merge"
240
+ incremental_strategy = IncrementalStrategy.merge
213
241
 
214
242
  m = hashlib.sha256()
215
243
  m.update(dest_table.encode("utf-8"))
216
244
 
217
245
  progressInstance: Collector = SpinnerCollector()
218
- if progress == "log":
246
+ if progress == Progress.log:
219
247
  progressInstance = LogCollector()
220
248
 
221
249
  pipeline = dlt.pipeline(
@@ -237,7 +265,7 @@ def ingest(
237
265
  f"[bold yellow] Destination:[/bold yellow] {factory.destination_scheme} / {dest_table}"
238
266
  )
239
267
  print(
240
- f"[bold yellow] Incremental Strategy:[/bold yellow] {incremental_strategy}"
268
+ f"[bold yellow] Incremental Strategy:[/bold yellow] {incremental_strategy.value}"
241
269
  )
242
270
  print(
243
271
  f"[bold yellow] Incremental Key:[/bold yellow] {incremental_key if incremental_key else 'None'}"
@@ -263,19 +291,24 @@ def ingest(
263
291
  merge_key=merge_key,
264
292
  interval_start=interval_start,
265
293
  interval_end=interval_end,
266
- sql_backend=sql_backend,
294
+ sql_backend=sql_backend.value,
267
295
  )
268
296
 
269
- if original_incremental_strategy == "delete+insert":
297
+ if original_incremental_strategy == IncrementalStrategy.delete_insert:
270
298
  dlt_source.incremental.primary_key = ()
271
299
 
272
300
  if (
273
301
  factory.destination_scheme in PARQUET_SUPPORTED_DESTINATIONS
274
- and loader_file_format == "default"
302
+ and loader_file_format is None
275
303
  ):
276
- loader_file_format = "parquet"
277
- elif loader_file_format == "default":
278
- loader_file_format = "jsonl"
304
+ loader_file_format = LoaderFileFormat.parquet
305
+
306
+ # if the source is a JSON returning source, we cannot use Parquet loader for BigQuery
307
+ if (
308
+ factory.destination_scheme == "bigquery"
309
+ and factory.source_scheme in JSON_RETURNING_SOURCES
310
+ ):
311
+ loader_file_format = None
279
312
 
280
313
  run_info = pipeline.run(
281
314
  dlt_source,
@@ -283,9 +316,11 @@ def ingest(
283
316
  uri=dest_uri,
284
317
  table=dest_table,
285
318
  ),
286
- write_disposition=incremental_strategy, # type: ignore
319
+ write_disposition=incremental_strategy.value, # type: ignore
287
320
  primary_key=(primary_key if primary_key and len(primary_key) > 0 else None), # type: ignore
288
- loader_file_format=loader_file_format, # type: ignore
321
+ loader_file_format=loader_file_format.value
322
+ if loader_file_format is not None
323
+ else None, # type: ignore
289
324
  )
290
325
 
291
326
  destination.post_load()
@@ -9,7 +9,7 @@ from .helpers.client import NotionClient
9
9
  from .helpers.database import NotionDatabase
10
10
 
11
11
 
12
- @dlt.source
12
+ @dlt.source(max_table_nesting=1)
13
13
  def notion_databases(
14
14
  database_ids: Optional[List[Dict[str, str]]] = None,
15
15
  api_key: str = dlt.secrets.value,
@@ -0,0 +1 @@
1
+ __version__ = "0.3.3"
@@ -1 +0,0 @@
1
- __version__ = "0.3.1"
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes