ingestr 0.8.4__tar.gz → 0.9.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of ingestr might be problematic. Click here for more details.

Files changed (133) hide show
  1. {ingestr-0.8.4 → ingestr-0.9.1}/PKG-INFO +14 -3
  2. {ingestr-0.8.4 → ingestr-0.9.1}/README.md +10 -0
  3. {ingestr-0.8.4 → ingestr-0.9.1}/docs/.vitepress/config.mjs +2 -0
  4. {ingestr-0.8.4 → ingestr-0.9.1}/docs/supported-sources/gsheets.md +10 -5
  5. ingestr-0.9.1/docs/supported-sources/s3.md +39 -0
  6. {ingestr-0.8.4 → ingestr-0.9.1}/docs/supported-sources/stripe.md +5 -0
  7. ingestr-0.9.1/docs/supported-sources/zendesk.md +84 -0
  8. {ingestr-0.8.4 → ingestr-0.9.1}/ingestr/src/factory.py +6 -0
  9. ingestr-0.9.1/ingestr/src/filesystem/__init__.py +98 -0
  10. ingestr-0.9.1/ingestr/src/filesystem/helpers.py +100 -0
  11. ingestr-0.9.1/ingestr/src/filesystem/readers.py +131 -0
  12. {ingestr-0.8.4 → ingestr-0.9.1}/ingestr/src/shopify/__init__.py +3 -1
  13. {ingestr-0.8.4 → ingestr-0.9.1}/ingestr/src/sources.py +149 -2
  14. ingestr-0.9.1/ingestr/src/version.py +1 -0
  15. ingestr-0.9.1/ingestr/src/zendesk/__init__.py +460 -0
  16. ingestr-0.9.1/ingestr/src/zendesk/helpers/__init__.py +25 -0
  17. ingestr-0.9.1/ingestr/src/zendesk/helpers/api_helpers.py +105 -0
  18. ingestr-0.9.1/ingestr/src/zendesk/helpers/credentials.py +54 -0
  19. ingestr-0.9.1/ingestr/src/zendesk/helpers/talk_api.py +118 -0
  20. ingestr-0.9.1/ingestr/src/zendesk/settings.py +57 -0
  21. {ingestr-0.8.4 → ingestr-0.9.1}/pyproject.toml +9 -0
  22. {ingestr-0.8.4 → ingestr-0.9.1}/requirements.txt +2 -1
  23. ingestr-0.8.4/ingestr/src/version.py +0 -1
  24. {ingestr-0.8.4 → ingestr-0.9.1}/.dockerignore +0 -0
  25. {ingestr-0.8.4 → ingestr-0.9.1}/.github/workflows/deploy-docs.yml +0 -0
  26. {ingestr-0.8.4 → ingestr-0.9.1}/.github/workflows/tests.yml +0 -0
  27. {ingestr-0.8.4 → ingestr-0.9.1}/.gitignore +0 -0
  28. {ingestr-0.8.4 → ingestr-0.9.1}/.python-version +0 -0
  29. {ingestr-0.8.4 → ingestr-0.9.1}/Dockerfile +0 -0
  30. {ingestr-0.8.4 → ingestr-0.9.1}/LICENSE.md +0 -0
  31. {ingestr-0.8.4 → ingestr-0.9.1}/Makefile +0 -0
  32. {ingestr-0.8.4 → ingestr-0.9.1}/docs/.vitepress/theme/custom.css +0 -0
  33. {ingestr-0.8.4 → ingestr-0.9.1}/docs/.vitepress/theme/index.js +0 -0
  34. {ingestr-0.8.4 → ingestr-0.9.1}/docs/commands/example-uris.md +0 -0
  35. {ingestr-0.8.4 → ingestr-0.9.1}/docs/commands/ingest.md +0 -0
  36. {ingestr-0.8.4 → ingestr-0.9.1}/docs/getting-started/core-concepts.md +0 -0
  37. {ingestr-0.8.4 → ingestr-0.9.1}/docs/getting-started/incremental-loading.md +0 -0
  38. {ingestr-0.8.4 → ingestr-0.9.1}/docs/getting-started/quickstart.md +0 -0
  39. {ingestr-0.8.4 → ingestr-0.9.1}/docs/getting-started/telemetry.md +0 -0
  40. {ingestr-0.8.4 → ingestr-0.9.1}/docs/index.md +0 -0
  41. {ingestr-0.8.4 → ingestr-0.9.1}/docs/supported-sources/adjust.md +0 -0
  42. {ingestr-0.8.4 → ingestr-0.9.1}/docs/supported-sources/airtable.md +0 -0
  43. {ingestr-0.8.4 → ingestr-0.9.1}/docs/supported-sources/appsflyer.md +0 -0
  44. {ingestr-0.8.4 → ingestr-0.9.1}/docs/supported-sources/bigquery.md +0 -0
  45. {ingestr-0.8.4 → ingestr-0.9.1}/docs/supported-sources/chess.md +0 -0
  46. {ingestr-0.8.4 → ingestr-0.9.1}/docs/supported-sources/csv.md +0 -0
  47. {ingestr-0.8.4 → ingestr-0.9.1}/docs/supported-sources/databricks.md +0 -0
  48. {ingestr-0.8.4 → ingestr-0.9.1}/docs/supported-sources/duckdb.md +0 -0
  49. {ingestr-0.8.4 → ingestr-0.9.1}/docs/supported-sources/facebook-ads.md +0 -0
  50. {ingestr-0.8.4 → ingestr-0.9.1}/docs/supported-sources/gorgias.md +0 -0
  51. {ingestr-0.8.4 → ingestr-0.9.1}/docs/supported-sources/hubspot.md +0 -0
  52. {ingestr-0.8.4 → ingestr-0.9.1}/docs/supported-sources/kafka.md +0 -0
  53. {ingestr-0.8.4 → ingestr-0.9.1}/docs/supported-sources/klaviyo.md +0 -0
  54. {ingestr-0.8.4 → ingestr-0.9.1}/docs/supported-sources/mongodb.md +0 -0
  55. {ingestr-0.8.4 → ingestr-0.9.1}/docs/supported-sources/mssql.md +0 -0
  56. {ingestr-0.8.4 → ingestr-0.9.1}/docs/supported-sources/mysql.md +0 -0
  57. {ingestr-0.8.4 → ingestr-0.9.1}/docs/supported-sources/notion.md +0 -0
  58. {ingestr-0.8.4 → ingestr-0.9.1}/docs/supported-sources/oracle.md +0 -0
  59. {ingestr-0.8.4 → ingestr-0.9.1}/docs/supported-sources/postgres.md +0 -0
  60. {ingestr-0.8.4 → ingestr-0.9.1}/docs/supported-sources/redshift.md +0 -0
  61. {ingestr-0.8.4 → ingestr-0.9.1}/docs/supported-sources/sap-hana.md +0 -0
  62. {ingestr-0.8.4 → ingestr-0.9.1}/docs/supported-sources/shopify.md +0 -0
  63. {ingestr-0.8.4 → ingestr-0.9.1}/docs/supported-sources/slack.md +0 -0
  64. {ingestr-0.8.4 → ingestr-0.9.1}/docs/supported-sources/snowflake.md +0 -0
  65. {ingestr-0.8.4 → ingestr-0.9.1}/docs/supported-sources/sqlite.md +0 -0
  66. {ingestr-0.8.4 → ingestr-0.9.1}/ingestr/main.py +0 -0
  67. {ingestr-0.8.4 → ingestr-0.9.1}/ingestr/src/.gitignore +0 -0
  68. {ingestr-0.8.4 → ingestr-0.9.1}/ingestr/src/adjust/_init_.py +0 -0
  69. {ingestr-0.8.4 → ingestr-0.9.1}/ingestr/src/adjust/helpers.py +0 -0
  70. {ingestr-0.8.4 → ingestr-0.9.1}/ingestr/src/airtable/__init__.py +0 -0
  71. {ingestr-0.8.4 → ingestr-0.9.1}/ingestr/src/appsflyer/_init_.py +0 -0
  72. {ingestr-0.8.4 → ingestr-0.9.1}/ingestr/src/appsflyer/client.py +0 -0
  73. {ingestr-0.8.4 → ingestr-0.9.1}/ingestr/src/chess/__init__.py +0 -0
  74. {ingestr-0.8.4 → ingestr-0.9.1}/ingestr/src/chess/helpers.py +0 -0
  75. {ingestr-0.8.4 → ingestr-0.9.1}/ingestr/src/chess/settings.py +0 -0
  76. {ingestr-0.8.4 → ingestr-0.9.1}/ingestr/src/destinations.py +0 -0
  77. {ingestr-0.8.4 → ingestr-0.9.1}/ingestr/src/facebook_ads/__init__.py +0 -0
  78. {ingestr-0.8.4 → ingestr-0.9.1}/ingestr/src/facebook_ads/exceptions.py +0 -0
  79. {ingestr-0.8.4 → ingestr-0.9.1}/ingestr/src/facebook_ads/helpers.py +0 -0
  80. {ingestr-0.8.4 → ingestr-0.9.1}/ingestr/src/facebook_ads/settings.py +0 -0
  81. {ingestr-0.8.4 → ingestr-0.9.1}/ingestr/src/google_sheets/README.md +0 -0
  82. {ingestr-0.8.4 → ingestr-0.9.1}/ingestr/src/google_sheets/__init__.py +0 -0
  83. {ingestr-0.8.4 → ingestr-0.9.1}/ingestr/src/google_sheets/helpers/__init__.py +0 -0
  84. {ingestr-0.8.4 → ingestr-0.9.1}/ingestr/src/google_sheets/helpers/api_calls.py +0 -0
  85. {ingestr-0.8.4 → ingestr-0.9.1}/ingestr/src/google_sheets/helpers/data_processing.py +0 -0
  86. {ingestr-0.8.4 → ingestr-0.9.1}/ingestr/src/gorgias/__init__.py +0 -0
  87. {ingestr-0.8.4 → ingestr-0.9.1}/ingestr/src/gorgias/helpers.py +0 -0
  88. {ingestr-0.8.4 → ingestr-0.9.1}/ingestr/src/hubspot/__init__.py +0 -0
  89. {ingestr-0.8.4 → ingestr-0.9.1}/ingestr/src/hubspot/helpers.py +0 -0
  90. {ingestr-0.8.4 → ingestr-0.9.1}/ingestr/src/hubspot/settings.py +0 -0
  91. {ingestr-0.8.4 → ingestr-0.9.1}/ingestr/src/kafka/__init__.py +0 -0
  92. {ingestr-0.8.4 → ingestr-0.9.1}/ingestr/src/kafka/helpers.py +0 -0
  93. {ingestr-0.8.4 → ingestr-0.9.1}/ingestr/src/klaviyo/_init_.py +0 -0
  94. {ingestr-0.8.4 → ingestr-0.9.1}/ingestr/src/klaviyo/client.py +0 -0
  95. {ingestr-0.8.4 → ingestr-0.9.1}/ingestr/src/klaviyo/helpers.py +0 -0
  96. {ingestr-0.8.4 → ingestr-0.9.1}/ingestr/src/mongodb/__init__.py +0 -0
  97. {ingestr-0.8.4 → ingestr-0.9.1}/ingestr/src/mongodb/helpers.py +0 -0
  98. {ingestr-0.8.4 → ingestr-0.9.1}/ingestr/src/notion/__init__.py +0 -0
  99. {ingestr-0.8.4 → ingestr-0.9.1}/ingestr/src/notion/helpers/__init__.py +0 -0
  100. {ingestr-0.8.4 → ingestr-0.9.1}/ingestr/src/notion/helpers/client.py +0 -0
  101. {ingestr-0.8.4 → ingestr-0.9.1}/ingestr/src/notion/helpers/database.py +0 -0
  102. {ingestr-0.8.4 → ingestr-0.9.1}/ingestr/src/notion/settings.py +0 -0
  103. {ingestr-0.8.4 → ingestr-0.9.1}/ingestr/src/shopify/exceptions.py +0 -0
  104. {ingestr-0.8.4 → ingestr-0.9.1}/ingestr/src/shopify/helpers.py +0 -0
  105. {ingestr-0.8.4 → ingestr-0.9.1}/ingestr/src/shopify/settings.py +0 -0
  106. {ingestr-0.8.4 → ingestr-0.9.1}/ingestr/src/slack/__init__.py +0 -0
  107. {ingestr-0.8.4 → ingestr-0.9.1}/ingestr/src/slack/helpers.py +0 -0
  108. {ingestr-0.8.4 → ingestr-0.9.1}/ingestr/src/slack/settings.py +0 -0
  109. {ingestr-0.8.4 → ingestr-0.9.1}/ingestr/src/sql_database/__init__.py +0 -0
  110. {ingestr-0.8.4 → ingestr-0.9.1}/ingestr/src/sql_database/arrow_helpers.py +0 -0
  111. {ingestr-0.8.4 → ingestr-0.9.1}/ingestr/src/sql_database/helpers.py +0 -0
  112. {ingestr-0.8.4 → ingestr-0.9.1}/ingestr/src/sql_database/override.py +0 -0
  113. {ingestr-0.8.4 → ingestr-0.9.1}/ingestr/src/sql_database/schema_types.py +0 -0
  114. {ingestr-0.8.4 → ingestr-0.9.1}/ingestr/src/stripe_analytics/__init__.py +0 -0
  115. {ingestr-0.8.4 → ingestr-0.9.1}/ingestr/src/stripe_analytics/helpers.py +0 -0
  116. {ingestr-0.8.4 → ingestr-0.9.1}/ingestr/src/stripe_analytics/settings.py +0 -0
  117. {ingestr-0.8.4 → ingestr-0.9.1}/ingestr/src/table_definition.py +0 -0
  118. {ingestr-0.8.4 → ingestr-0.9.1}/ingestr/src/telemetry/event.py +0 -0
  119. {ingestr-0.8.4 → ingestr-0.9.1}/ingestr/src/testdata/fakebqcredentials.json +0 -0
  120. {ingestr-0.8.4 → ingestr-0.9.1}/ingestr/testdata/.gitignore +0 -0
  121. {ingestr-0.8.4 → ingestr-0.9.1}/ingestr/testdata/create_replace.csv +0 -0
  122. {ingestr-0.8.4 → ingestr-0.9.1}/ingestr/testdata/delete_insert_expected.csv +0 -0
  123. {ingestr-0.8.4 → ingestr-0.9.1}/ingestr/testdata/delete_insert_part1.csv +0 -0
  124. {ingestr-0.8.4 → ingestr-0.9.1}/ingestr/testdata/delete_insert_part2.csv +0 -0
  125. {ingestr-0.8.4 → ingestr-0.9.1}/ingestr/testdata/merge_expected.csv +0 -0
  126. {ingestr-0.8.4 → ingestr-0.9.1}/ingestr/testdata/merge_part1.csv +0 -0
  127. {ingestr-0.8.4 → ingestr-0.9.1}/ingestr/testdata/merge_part2.csv +0 -0
  128. {ingestr-0.8.4 → ingestr-0.9.1}/package-lock.json +0 -0
  129. {ingestr-0.8.4 → ingestr-0.9.1}/package.json +0 -0
  130. {ingestr-0.8.4 → ingestr-0.9.1}/requirements-dev.txt +0 -0
  131. {ingestr-0.8.4 → ingestr-0.9.1}/resources/demo.gif +0 -0
  132. {ingestr-0.8.4 → ingestr-0.9.1}/resources/demo.tape +0 -0
  133. {ingestr-0.8.4 → ingestr-0.9.1}/resources/ingestr.svg +0 -0
@@ -1,11 +1,10 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: ingestr
3
- Version: 0.8.4
3
+ Version: 0.9.1
4
4
  Summary: ingestr is a command-line application that ingests data from various sources and stores them in any database.
5
5
  Project-URL: Homepage, https://github.com/bruin-data/ingestr
6
6
  Project-URL: Issues, https://github.com/bruin-data/ingestr/issues
7
7
  Author-email: Burak Karakan <burak.karakan@getbruin.com>
8
- License-File: LICENSE.md
9
8
  Classifier: Development Status :: 4 - Beta
10
9
  Classifier: Environment :: Console
11
10
  Classifier: Intended Audience :: Developers
@@ -15,7 +14,6 @@ Classifier: Programming Language :: Python :: 3
15
14
  Classifier: Topic :: Database
16
15
  Requires-Python: >=3.9
17
16
  Requires-Dist: confluent-kafka>=2.3.0
18
- Requires-Dist: cx-oracle==8.3.0
19
17
  Requires-Dist: databricks-sql-connector==2.9.3
20
18
  Requires-Dist: dlt==0.5.1
21
19
  Requires-Dist: duckdb-engine==0.11.5
@@ -35,6 +33,7 @@ Requires-Dist: pyrate-limiter==3.6.1
35
33
  Requires-Dist: redshift-connector==2.1.0
36
34
  Requires-Dist: rich==13.7.1
37
35
  Requires-Dist: rudder-sdk-python==2.1.0
36
+ Requires-Dist: s3fs==2024.9.0
38
37
  Requires-Dist: snowflake-sqlalchemy==1.5.3
39
38
  Requires-Dist: sqlalchemy-bigquery==1.11.0
40
39
  Requires-Dist: sqlalchemy-hana==2.0.0
@@ -45,6 +44,8 @@ Requires-Dist: stripe==10.7.0
45
44
  Requires-Dist: tqdm==4.66.2
46
45
  Requires-Dist: typer==0.12.3
47
46
  Requires-Dist: types-requests==2.32.0.20240907
47
+ Provides-Extra: oracle
48
+ Requires-Dist: cx-oracle==8.3.0; extra == 'oracle'
48
49
  Description-Content-Type: text/markdown
49
50
 
50
51
  <div align="center">
@@ -226,6 +227,11 @@ Join our Slack community [here](https://join.slack.com/t/bruindatacommunity/shar
226
227
  <td>Notion</td>
227
228
  <td>✅</td>
228
229
  <td>-</td>
230
+ </tr>
231
+ <tr>
232
+ <td>S3</td>
233
+ <td>✅</td>
234
+ <td>-</td>
229
235
  </tr>
230
236
  <tr>
231
237
  <td>Shopify</td>
@@ -242,6 +248,11 @@ Join our Slack community [here](https://join.slack.com/t/bruindatacommunity/shar
242
248
  <td>✅</td>
243
249
  <td>-</td>
244
250
  </tr>
251
+ <tr>
252
+ <td>Zendesk</td>
253
+ <td>✅</td>
254
+ <td>-</td>
255
+ </tr>
245
256
  </table>
246
257
 
247
258
  More to come soon!
@@ -177,6 +177,11 @@ Join our Slack community [here](https://join.slack.com/t/bruindatacommunity/shar
177
177
  <td>Notion</td>
178
178
  <td>✅</td>
179
179
  <td>-</td>
180
+ </tr>
181
+ <tr>
182
+ <td>S3</td>
183
+ <td>✅</td>
184
+ <td>-</td>
180
185
  </tr>
181
186
  <tr>
182
187
  <td>Shopify</td>
@@ -193,6 +198,11 @@ Join our Slack community [here](https://join.slack.com/t/bruindatacommunity/shar
193
198
  <td>✅</td>
194
199
  <td>-</td>
195
200
  </tr>
201
+ <tr>
202
+ <td>Zendesk</td>
203
+ <td>✅</td>
204
+ <td>-</td>
205
+ </tr>
196
206
  </table>
197
207
 
198
208
  More to come soon!
@@ -97,9 +97,11 @@ export default defineConfig({
97
97
  { text: "HubSpot", link: "/supported-sources/hubspot.md" },
98
98
  { text: "Klaviyo", link: "/supported-sources/klaviyo.md" },
99
99
  { text: "Notion", link: "/supported-sources/notion.md" },
100
+ { text: "S3", link: "/supported-sources/s3.md" },
100
101
  { text: "Shopify", link: "/supported-sources/shopify.md" },
101
102
  { text: "Slack", link: "/supported-sources/slack.md" },
102
103
  { text: "Stripe", link: "/supported-sources/stripe.md" },
104
+ { text: "Zendesk", link: "/supported-sources/zendesk.md" },
103
105
  ],
104
106
  },
105
107
  ],
@@ -1,9 +1,11 @@
1
1
  # Google Sheets
2
+
2
3
  [Google Sheets](https://www.google.com/sheets/about/) is a web-based spreadsheet program that is part of Google's free, web-based Google Docs Editors suite.
3
4
 
4
5
  ingestr supports Google Sheets as a source.
5
6
 
6
7
  ## URI Format
8
+
7
9
  The URI format for Google Sheets is as follows:
8
10
 
9
11
  ```
@@ -11,11 +13,13 @@ gsheets://?credentials_path=/path/to/service/account.json
11
13
  ```
12
14
 
13
15
  Alternatively, you can use base64 encoded credentials:
16
+
14
17
  ```
15
18
  gsheets://?credentials_base64=<base64_encoded_credentials>
16
19
  ```
17
20
 
18
21
  URI parameters:
22
+
19
23
  - `credentials_path`: the path to the service account JSON file
20
24
 
21
25
  The URI is used to connect to the Google Sheets API for extracting data.
@@ -24,15 +28,16 @@ The URI is used to connect to the Google Sheets API for extracting data.
24
28
 
25
29
  Google Sheets requires a few steps to set up an integration, please follow the guide dltHub [has built here](https://dlthub.com/docs/dlt-ecosystem/verified-sources/google_sheets#setup-guide).
26
30
 
27
- Once you complete the guide, you should have a service account JSON file, and the spreadsheet ID to connect to. Let's say:
28
- - you store your JSON file in path `/path/to/file.json`
29
- - the spreadsheet you'd like to connect to is `abcdxyz`,
30
- - the sheet inside the spreadsheet is `Sheet1`
31
+ Once you complete the guide, you should have a service account JSON file and the spreadsheet ID to connect to. Let's say:
32
+
33
+ - you store your JSON file at the path `/path/to/file.json`.
34
+ - the spreadsheet you'd like to connect to has the ID `fkdUQ2bjdNfUq2CA`. For example, if your spreadsheet URL is `https://docs.google.com/spreadsheets/d/fkdUQ2bjdNfUq2CA/edit?pli=1&gid=0#gid=0`, then the spreadsheet ID is `fkdUQ2bjdNfUq2CA`.
35
+ - the sheet inside the spreadsheet is `Sheet1`.
31
36
 
32
37
  Based on this assumption, here's a sample command that will copy the data from the Google Sheets spreadsheet into a duckdb database:
33
38
 
34
39
  ```sh
35
- ingestr ingest --source-uri 'gsheets://?credentials_path=/path/to/file.json' --source-table 'abcdxyz.Sheet1' --dest-uri duckdb:///gsheets.duckdb --dest-table 'gsheets.output'
40
+ ingestr ingest --source-uri 'gsheets://?credentials_path=/path/to/file.json' --source-table 'fkdUQ2bjdNfUq2CA.Sheet1' --dest-uri duckdb:///gsheets.duckdb --dest-table 'gsheets.output'
36
41
  ```
37
42
 
38
43
  The result of this command will be a table in the \`gsheets.duckdb\` database.
@@ -0,0 +1,39 @@
1
+ # S3
2
+
3
+ [S3](https://aws.amazon.com/s3/) is a bucket for storing data in Amazon's Simple Storage Service, a cloud-based storage solution provided by AWS. S3 buckets allow users to store and retrieve data at any time from anywhere on the web.
4
+
5
+ ingestr supports S3 as a source.
6
+
7
+ ## URI Format
8
+
9
+ The URI format for S3 is as follows:
10
+
11
+ ```plaintext
12
+ s3://<bucket_name>/<path_to_file>?access_key_id=<access_key_id>&secret_access_key=<secret_access_key>
13
+ ```
14
+
15
+ URI parameters:
16
+
17
+ - `bucket_name`: The name of the bucket
18
+ - `path_to_files`: The relative path from the root of the bucket. You can find this from the S3 URI. For example, if your S3 URI is `s3://mybucket/students/students_details.csv`, then your bucket name is `mybucket` and path_to_files is `students/students_details.csv`.
19
+ - `access_key_id` and `secret_access_key` : Used for accessing S3 bucket.
20
+
21
+ ## Setting up a S3 Integration
22
+
23
+ S3 requires access_key_id and secret_access_key. Please follow the guide on dltHub to [obtain credentials](https://dlthub.com/docs/dlt-ecosystem/verified-sources/filesystem/basic#get-credentials). Once you've completed the guide, you should have an `access_key_id` and `secret_access_key`. From the S3 URI, you can extract the `bucket_name` and `path_to_files`
24
+
25
+ For example, if your access_key_id is `AKC3YOW7E`, secret_access_key is `XCtkpL5B`, bucket name is `my_bucket`, and path_to_files is `students/students_details.csv`, here's a sample command that will copy the data from the S3 bucket into a DuckDB database:
26
+
27
+ ```sh
28
+ ingestr ingest --source-uri 's3://my_bucket/students/students_details.csv?access_key_id=AKC3YOW7E&secret_access_key=XCtkpL5B' --source-table 'students_details' --dest-uri duckdb:///s3.duckdb --dest-table 'dest.students_details'
29
+ ```
30
+
31
+ The result of this command will be a table in the s3.duckdb database.
32
+
33
+ Below are some examples of file path patterns, each path pattern is a reference from the root of the bucket:
34
+
35
+ - `**/*.csv`: Retrieves all .csv files, regardless of how deep they are within the folder structure.
36
+ - `*.csv`: Retrieves all .csv files from the first level of a folder.
37
+ - `myFolder/**/*.jsonl`: Retrieves all .jsonl files from anywhere under myFolder.
38
+ - `myFolder/mySubFolder/users.parquet`: Retrieves the users.parquet file from mySubFolder.
39
+ - `employees.jsonl`: Retrieves the employees.jsonl file from the root level of bucket.
@@ -1,9 +1,11 @@
1
1
  # Stripe
2
+
2
3
  [Stripe](https://www.stripe.com/) is a technology company that builds economic infrastructure for the internet, providing payment processing software and APIs for e-commerce websites and mobile applications.
3
4
 
4
5
  ingestr supports Stripe as a source.
5
6
 
6
7
  ## URI Format
8
+
7
9
  The URI format for Stripe is as follows:
8
10
 
9
11
  ```plaintext
@@ -11,6 +13,7 @@ stripe://?api_key=<api-key-here>
11
13
  ```
12
14
 
13
15
  URI parameters:
16
+
14
17
  - `api_key`: the API key used for authentication with the Stripe API
15
18
 
16
19
  The URI is used to connect to the Stripe API for extracting data. More details on setting up Stripe integrations can be found [here](https://stripe.com/docs/api).
@@ -28,7 +31,9 @@ ingestr ingest --source-uri 'stripe://?api_key=sk_test_12345' --source-table 'ch
28
31
  The result of this command will be a table in the `stripe.duckdb` database with JSON columns.
29
32
 
30
33
  ## Available Tables
34
+
31
35
  Stripe source allows ingesting the following sources into separate tables:
36
+
32
37
  - `subscription`: Represents a customer's subscription to a recurring service, detailing billing cycles, plans, and status.
33
38
  - `account`: Contains information about a Stripe account, including balances, payouts, and account settings.
34
39
  - `coupon`: Stores data about discount codes or coupons that can be applied to invoices, subscriptions, or other charges.
@@ -0,0 +1,84 @@
1
+ # Zendesk
2
+
3
+ [Zendesk](https://www.zendesk.com/) is a cloud-based customer service and support platform. It offers a range of features including ticket management, self-service options, knowledgebase management, live chat, customer analytics, and conversations.
4
+
5
+ ingestr supports Zendesk as a source.
6
+
7
+ The Zendesk supports two authentication methods when connecting through ingestr:
8
+ - OAuth Token
9
+ - API Token
10
+
11
+ For all resources except chat resources, you can use either the [API Token](https://dlthub.com/docs/dlt-ecosystem/verified-sources/zendesk#grab-zendesk-support-api-token) or the Zendesk Support [OAuth Token](https://dlthub.com/docs/dlt-ecosystem/verified-sources/zendesk#zendesk-support-oauth-token) to fetch data. However, for chat resources, you must use the [OAuth Token](https://dlthub.com/docs/dlt-ecosystem/verified-sources/zendesk#zendesk-chat) specific to Zendesk Chat.
12
+
13
+ ## URI Format
14
+
15
+ The URI format for Zendesk based on the authentication method:
16
+ ### For OAuth Token Authentication:
17
+ ```plaintext
18
+ zendesk://:<oauth_token>@<sub-domain>
19
+ ```
20
+ ### For API Token Authentication:
21
+ ```plaintext
22
+ zendesk://<email>:<api_token>@<sub-domain>
23
+ ```
24
+
25
+ URI parameters:
26
+
27
+ - `subdomain`: Unique zendesk subdomain that can be found in account url. For example, if your account url is https://My_Company.zendesk.com/, then `My_Company` is your subdomain
28
+ - `email`: Email address of the user
29
+ - `api_token`: API token used for authentication with zendesk
30
+ - `oauth_token`: OAuth token used for authentication with zendesk
31
+
32
+ ## Setting up a Zendesk Integration
33
+
34
+ Zendesk requires a few steps to set up an integration, please follow the guide dltHub [has built here](https://dlthub.com/docs/dlt-ecosystem/verified-sources/zendesk#setup-guide).
35
+
36
+ Once you complete the guide, if you decide to use an OAuth Token, you should have a subdomain and an OAuth token. Let’s say your subdomain is `mycompany` and your OAuth token is `qVsbdiasVt`.
37
+
38
+ ```sh
39
+ ingestr ingest --source-uri "zendesk://:qVsbdiasVt@mycompany" \
40
+ --source-table 'tickets' \
41
+ --dest-uri 'duckdb:///zendesk.duckdb' \
42
+ --dest-table 'zendesk.tickets' \
43
+ --interval-start '2024-01-01'
44
+ ```
45
+
46
+ If you decide to use an API Token, you should have a subdomain, email, and API token. Let’s say your subdomain is `mycompany`, your email is `john@get.com`, and your API token is `nbs123`.
47
+
48
+ ```sh
49
+ ingestr ingest --source-uri "zendesk://john@get.com:nbs123@mycompany" \
50
+ --source-table 'tickets' \
51
+ --dest-uri 'duckdb:///zendesk.duckdb' \
52
+ --dest-table 'zendesk.tickets' \
53
+ --interval-start '2024-01-01'
54
+ ```
55
+
56
+ The result of this command will be a table in the `zendesk.duckdb` database.
57
+
58
+ ## Available Tables
59
+
60
+ Zendesk source allows ingesting the following sources into separate tables:
61
+
62
+ - [activities](https://developer.zendesk.com/api-reference/ticketing/tickets/activity_stream/): Retrieves ticket activities affecting the agent
63
+ - [addresses](https://developer.zendesk.com/api-reference/voice/talk-api/addresses/): Retrieves addresses information
64
+ - [agents_activity](https://developer.zendesk.com/api-reference/voice/talk-api/stats/#list-agents-activity): Retrieves activity information for agents
65
+ - [automations](https://developer.zendesk.com/api-reference/ticketing/business-rules/automations/): Retrives automations for the current account
66
+ - [brands](https://developer.zendesk.com/api-reference/ticketing/account-configuration/brands/): Retrieves all brands for your account
67
+ - [calls](https://developer.zendesk.com/api-reference/voice/talk-api/incremental_exports/#incremental-calls-export): Retrieves all calls specific to channels
68
+ - [chats](https://developer.zendesk.com/api-reference/live-chat/chat-api/incremental_export/): Retrieves available chats
69
+ - [greetings](https://developer.zendesk.com/api-reference/voice/talk-api/greetings/): Retrieves all default or customs greetings
70
+ - [groups](https://developer.zendesk.com/api-reference/ticketing/groups/groups/): Retrieves groups of support agents
71
+ - [legs_incremental](https://developer.zendesk.com/api-reference/voice/talk-api/incremental_exports/#incremental-call-legs-export): Retrieves detailed information about each agent involved in a call
72
+ - [lines](https://developer.zendesk.com/api-reference/voice/talk-api/lines/): Retrieves all available lines (phone numbers and digital lines) in your Zendesk voice account
73
+ - [organizations](https://developer.zendesk.com/api-reference/ticketing/organizations/organizations/) : Retrieves organizations (your customers can be grouped into organizations by their email domain)
74
+ - [phone_numbers](https://developer.zendesk.com/api-reference/voice/talk-api/phone_numbers/): Retrieves all available phone numbers
75
+ - [settings](https://developer.zendesk.com/api-reference/voice/talk-api/voice_settings/): Retrieves account settings related to Zendesk voice accounts
76
+ - [sla_policies](https://developer.zendesk.com/api-reference/ticketing/business-rules/sla_policies/): Retrives different sla policies.
77
+ - [targets](https://developer.zendesk.com/api-reference/ticketing/targets/targets/): Retrieves targets where as targets are data from Zendesk to external applications like Slack when a ticket is updated or created.
78
+ - [tickets](https://developer.zendesk.com/api-reference/ticketing/tickets/tickets/): Retrieves all tickets, which are the means through which customers communicate with agents
79
+ - [ticket_forms](https://developer.zendesk.com/api-reference/ticketing/tickets/ticket_forms/): Retrieves all ticket forms
80
+ - [ticket_metrics](https://developer.zendesk.com/api-reference/ticketing/tickets/ticket_metrics/): Retrieves various metrics about one or more tickets.
81
+ - [ticket_metric_events](https://developer.zendesk.com/api-reference/ticketing/tickets/ticket_metric_events/): Retrieves ticket metric events that occurred on or after the start time
82
+ - [users](https://developer.zendesk.com/api-reference/ticketing/users/users/): Retrieves all users
83
+
84
+ Use these as `--source-table` parameter in the `ingestr ingest` command.
@@ -28,10 +28,12 @@ from ingestr.src.sources import (
28
28
  LocalCsvSource,
29
29
  MongoDbSource,
30
30
  NotionSource,
31
+ S3Source,
31
32
  ShopifySource,
32
33
  SlackSource,
33
34
  SqlSource,
34
35
  StripeAnalyticsSource,
36
+ ZendeskSource,
35
37
  )
36
38
 
37
39
  SQL_SOURCE_SCHEMES = [
@@ -132,6 +134,10 @@ class SourceDestinationFactory:
132
134
  return KafkaSource()
133
135
  elif self.source_scheme == "adjust":
134
136
  return AdjustSource()
137
+ elif self.source_scheme == "zendesk":
138
+ return ZendeskSource()
139
+ elif self.source_scheme == "s3":
140
+ return S3Source()
135
141
  else:
136
142
  raise ValueError(f"Unsupported source scheme: {self.source_scheme}")
137
143
 
@@ -0,0 +1,98 @@
1
+ """Reads files in s3, gs or azure buckets using fsspec and provides convenience resources for chunked reading of various file formats"""
2
+
3
+ from typing import Iterator, List, Optional, Tuple, Union
4
+
5
+ import dlt
6
+ from dlt.sources import DltResource
7
+ from dlt.sources.credentials import FileSystemCredentials
8
+ from dlt.sources.filesystem import FileItem, FileItemDict, fsspec_filesystem, glob_files
9
+
10
+ from .helpers import (
11
+ AbstractFileSystem,
12
+ FilesystemConfigurationResource,
13
+ )
14
+ from .readers import (
15
+ ReadersSource,
16
+ _read_csv,
17
+ _read_csv_duckdb,
18
+ _read_jsonl,
19
+ _read_parquet,
20
+ )
21
+
22
+
23
+ @dlt.source(_impl_cls=ReadersSource, spec=FilesystemConfigurationResource)
24
+ def readers(
25
+ bucket_url: str,
26
+ credentials: Union[FileSystemCredentials, AbstractFileSystem],
27
+ file_glob: Optional[str] = "*",
28
+ ) -> Tuple[DltResource, ...]:
29
+ """This source provides a few resources that are chunked file readers. Readers can be further parametrized before use
30
+ read_csv(chunksize, **pandas_kwargs)
31
+ read_jsonl(chunksize)
32
+ read_parquet(chunksize)
33
+
34
+ Args:
35
+ bucket_url (str): The url to the bucket.
36
+ credentials (FileSystemCredentials | AbstractFilesystem): The credentials to the filesystem of fsspec `AbstractFilesystem` instance.
37
+ file_glob (str, optional): The filter to apply to the files in glob format. by default lists all files in bucket_url non-recursively
38
+ """
39
+ filesystem_resource = filesystem(bucket_url, credentials, file_glob=file_glob)
40
+ filesystem_resource.apply_hints(
41
+ incremental=dlt.sources.incremental("modification_date")
42
+ )
43
+ return (
44
+ filesystem_resource | dlt.transformer(name="read_csv")(_read_csv),
45
+ filesystem_resource | dlt.transformer(name="read_jsonl")(_read_jsonl),
46
+ filesystem_resource | dlt.transformer(name="read_parquet")(_read_parquet),
47
+ filesystem_resource | dlt.transformer(name="read_csv_duckdb")(_read_csv_duckdb),
48
+ )
49
+
50
+
51
+ @dlt.resource(
52
+ primary_key="file_url", spec=FilesystemConfigurationResource, standalone=True
53
+ )
54
+ def filesystem(
55
+ bucket_url: str = dlt.secrets.value,
56
+ credentials: Union[FileSystemCredentials, AbstractFileSystem] = dlt.secrets.value,
57
+ file_glob: Optional[str] = "*",
58
+ files_per_page: int = 100,
59
+ extract_content: bool = True,
60
+ ) -> Iterator[List[FileItem]]:
61
+ """This resource lists files in `bucket_url` using `file_glob` pattern. The files are yielded as FileItem which also
62
+ provide methods to open and read file data. It should be combined with transformers that further process (ie. load files)
63
+
64
+ Args:
65
+ bucket_url (str): The url to the bucket.
66
+ credentials (FileSystemCredentials | AbstractFilesystem): The credentials to the filesystem of fsspec `AbstractFilesystem` instance.
67
+ file_glob (str, optional): The filter to apply to the files in glob format. by default lists all files in bucket_url non-recursively
68
+ files_per_page (int, optional): The number of files to process at once, defaults to 100.
69
+ extract_content (bool, optional): If true, the content of the file will be extracted if
70
+ false it will return a fsspec file, defaults to False.
71
+
72
+ Returns:
73
+ Iterator[List[FileItem]]: The list of files.
74
+ """
75
+
76
+ if isinstance(credentials, AbstractFileSystem):
77
+ fs_client = credentials
78
+ else:
79
+ fs_client = fsspec_filesystem(bucket_url, credentials)[0]
80
+
81
+ files_chunk: List[FileItem] = []
82
+ for file_model in glob_files(fs_client, bucket_url, file_glob):
83
+ file_dict = FileItemDict(file_model, credentials)
84
+ if extract_content:
85
+ file_dict["file_content"] = file_dict.read_bytes()
86
+ files_chunk.append(file_dict) # type: ignore
87
+ # wait for the chunk to be full
88
+ if len(files_chunk) >= files_per_page:
89
+ yield files_chunk
90
+ files_chunk = []
91
+ if files_chunk:
92
+ yield files_chunk
93
+
94
+
95
+ read_csv = dlt.transformer(standalone=True)(_read_csv)
96
+ read_jsonl = dlt.transformer(standalone=True)(_read_jsonl)
97
+ read_parquet = dlt.transformer(standalone=True)(_read_parquet)
98
+ read_csv_duckdb = dlt.transformer(standalone=True)(_read_csv_duckdb)
@@ -0,0 +1,100 @@
1
+ """Helpers for the filesystem resource."""
2
+
3
+ from typing import Any, Dict, Iterable, List, Optional, Type, Union
4
+
5
+ import dlt
6
+ from dlt.common.configuration import resolve_type
7
+ from dlt.common.typing import TDataItem
8
+ from dlt.sources import DltResource
9
+ from dlt.sources.config import configspec, with_config
10
+ from dlt.sources.credentials import (
11
+ CredentialsConfiguration,
12
+ FilesystemConfiguration,
13
+ FileSystemCredentials,
14
+ )
15
+ from dlt.sources.filesystem import fsspec_filesystem
16
+ from fsspec import AbstractFileSystem # type: ignore
17
+
18
+
19
@configspec
class FilesystemConfigurationResource(FilesystemConfiguration):
    # Resource-level configuration for the filesystem source. Extends dlt's
    # base FilesystemConfiguration with listing/pagination options, and allows
    # an already-authorized fsspec AbstractFileSystem instance to be passed
    # directly in place of raw credentials.
    credentials: Union[FileSystemCredentials, AbstractFileSystem] = None
    file_glob: Optional[str] = "*"
    files_per_page: int = 100
    extract_content: bool = False

    @resolve_type("credentials")
    def resolve_credentials_type(self) -> Type[CredentialsConfiguration]:
        """Resolve the concrete credentials spec for the configured protocol.

        Uses the credentials class registered for ``self.protocol`` when one
        is known; otherwise falls back to an optional generic
        ``CredentialsConfiguration``. In both cases an ``AbstractFileSystem``
        instance is also accepted as-is.
        """
        # use known credentials or empty credentials for unknown protocol
        return Union[
            self.PROTOCOL_CREDENTIALS.get(self.protocol)
            or Optional[CredentialsConfiguration],
            AbstractFileSystem,
        ] # type: ignore[return-value]
34
+
35
+
36
def fsspec_from_resource(filesystem_instance: DltResource) -> AbstractFileSystem:
    """Extract authorized fsspec client from a filesystem resource.

    Args:
        filesystem_instance (DltResource): A `filesystem` resource instance
            whose resolved configuration (bucket_url and credentials) is used
            to build the client.

    Returns:
        AbstractFileSystem: An authorized fsspec filesystem client.
    """

    # Resolve config from the same secrets/config sections the resource itself
    # uses, so explicitly passed arguments and injected values combine the
    # same way they would during extraction.
    @with_config(
        spec=FilesystemConfiguration,
        sections=("sources", filesystem_instance.section, filesystem_instance.name),
    )
    def _get_fsspec(
        bucket_url: str, credentials: Optional[FileSystemCredentials]
    ) -> AbstractFileSystem:
        return fsspec_filesystem(bucket_url, credentials)[0]

    # Prefer arguments that were passed to the resource explicitly; fall back
    # to dlt config/secrets injection for anything not provided.
    return _get_fsspec(
        filesystem_instance.explicit_args.get("bucket_url", dlt.config.value),
        filesystem_instance.explicit_args.get("credentials", dlt.secrets.value),
    )
52
+
53
+
54
def add_columns(columns: List[str], rows: List[List[Any]]) -> List[Dict[str, Any]]:
    """Pair each row's values with the given column names.

    Args:
        columns (List[str]): The column names.
        rows (List[List[Any]]): The rows of raw values.

    Returns:
        List[Dict[str, Any]]: One dict per row, keyed by column name.
    """
    return [dict(zip(columns, values)) for values in rows]
69
+
70
+
71
def fetch_arrow(file_data, chunk_size: int) -> Iterable[TDataItem]:  # type: ignore
    """Stream the given DuckDB relation as arrow record batches.

    Args:
        file_data (DuckDBPyRelation): The relation created from the CSV file.
        chunk_size (int): The number of rows to read at once.

    Yields:
        Iterable[TDataItem]: Arrow record batches read from the relation.
    """
    reader = file_data.fetch_arrow_reader(batch_size=chunk_size)
    for record_batch in reader:
        yield record_batch
83
+
84
+
85
def fetch_json(file_data, chunk_size: int) -> Iterator[List[Dict[str, Any]]]:  # type: ignore
    """Fetch rows from the given DuckDB relation as lists of dicts.

    Fixes vs. previous version: this is a generator, so the return annotation
    is ``Iterator[...]`` (it was wrongly declared ``List[Dict[str, Any]]``),
    and the docstring no longer claims arrow-reader behavior.

    Args:
        file_data (DuckDBPyRelation): The relation created from the CSV file.
        chunk_size (int): The number of rows to read at once.

    Yields:
        List[Dict[str, Any]]: Batches of rows keyed by column name.
    """
    while True:
        batch = file_data.fetchmany(chunk_size)
        if not batch:
            break

        # pair each row's values with the relation's column names
        yield [dict(zip(file_data.columns, row)) for row in batch]
@@ -0,0 +1,131 @@
1
+ from typing import TYPE_CHECKING, Any, Iterator, Optional
2
+
3
+ from dlt.common import json
4
+ from dlt.common.typing import copy_sig
5
+ from dlt.sources import DltResource, DltSource, TDataItems
6
+ from dlt.sources.filesystem import FileItemDict
7
+
8
+ from .helpers import fetch_arrow, fetch_json
9
+
10
+
11
+ def _read_csv(
12
+ items: Iterator[FileItemDict], chunksize: int = 10000, **pandas_kwargs: Any
13
+ ) -> Iterator[TDataItems]:
14
+ """Reads csv file with Pandas chunk by chunk.
15
+
16
+ Args:
17
+ chunksize (int): Number of records to read in one chunk
18
+ **pandas_kwargs: Additional keyword arguments passed to Pandas.read_csv
19
+ Returns:
20
+ TDataItem: The file content
21
+ """
22
+ import pandas as pd
23
+
24
+ # apply defaults to pandas kwargs
25
+ kwargs = {**{"header": "infer", "chunksize": chunksize}, **pandas_kwargs}
26
+
27
+ for file_obj in items:
28
+ # Here we use pandas chunksize to read the file in chunks and avoid loading the whole file
29
+ # in memory.
30
+ with file_obj.open() as file:
31
+ for df in pd.read_csv(file, **kwargs):
32
+ yield df.to_dict(orient="records")
33
+
34
+
35
def _read_jsonl(
    items: Iterator[FileItemDict], chunksize: int = 1000
) -> Iterator[TDataItems]:
    """Read JSONL files line by line, yielding parsed objects in chunks.

    Args:
        items (Iterator[FileItemDict]): The file items to read.
        chunksize (int, optional): The number of JSON lines to load and yield
            at once, defaults to 1000.

    Yields:
        TDataItems: Lists of parsed JSON objects.
    """
    for file_obj in items:
        with file_obj.open() as f:
            buffered = []
            for raw_line in f:
                buffered.append(json.loadb(raw_line))
                if len(buffered) >= chunksize:
                    yield buffered
                    buffered = []
            # flush the final partial chunk, if any
            if buffered:
                yield buffered
56
+
57
+
58
def _read_parquet(
    items: Iterator[FileItemDict],
    chunksize: int = 10,
) -> Iterator[TDataItems]:
    """Read parquet files, yielding their rows in record batches.

    Args:
        items (Iterator[FileItemDict]): The file items to read.
        chunksize (int, optional): The number of rows per yielded batch,
            defaults to 10.

    Yields:
        TDataItems: Lists of row dicts from each record batch.
    """
    from pyarrow import parquet as pq

    for file_obj in items:
        with file_obj.open() as f:
            for batch in pq.ParquetFile(f).iter_batches(batch_size=chunksize):
                yield batch.to_pylist()
77
+
78
+
79
def _read_csv_duckdb(
    items: Iterator[FileItemDict],
    chunk_size: Optional[int] = 5000,
    use_pyarrow: bool = False,
    **duckdb_kwargs: Any,
) -> Iterator[TDataItems]:
    """A resource to extract data from the given CSV files.

    Uses DuckDB engine to import and cast CSV data.

    Args:
        items (Iterator[FileItemDict]): CSV files to read.
        chunk_size (Optional[int]):
            The number of rows to read at once. Defaults to 5000.
        use_pyarrow (bool):
            Whether to use `pyarrow` to read the data and designate
            data schema. If set to False (by default), JSON is used.
        duckdb_kwargs (Dict):
            Additional keyword arguments to pass to the `read_csv()`.

    Returns:
        Iterable[TDataItem]: Data items, read from the given CSV files.
    """
    import duckdb

    helper = fetch_arrow if use_pyarrow else fetch_json

    for item in items:
        with item.open() as f:
            file_data = duckdb.from_csv_auto(f, **duckdb_kwargs)  # type: ignore
            # Yield while the file handle is still open: duckdb reads from the
            # stream lazily, so fetching rows after the `with` block exits
            # could read from a closed file.
            yield from helper(file_data, chunk_size)
111
+
112
+
113
if TYPE_CHECKING:

    class ReadersSource(DltSource):
        """Typing stub that provides docstrings and signatures to the resources in the `readers` source."""

        # Signatures are copied from the private reader implementations so
        # IDEs and type checkers see the real parameters of each transformer.
        @copy_sig(_read_csv)
        def read_csv(self) -> DltResource: ...

        @copy_sig(_read_jsonl)
        def read_jsonl(self) -> DltResource: ...

        @copy_sig(_read_parquet)
        def read_parquet(self) -> DltResource: ...

        @copy_sig(_read_csv_duckdb)
        def read_csv_duckdb(self) -> DltResource: ...

else:
    # At runtime the stub adds nothing; plain DltSource is used directly.
    ReadersSource = DltSource
@@ -1,8 +1,9 @@
1
1
  """Fetches Shopify Orders and Products."""
2
2
 
3
- from typing import Iterable, Optional
3
+ from typing import Any, Dict, Iterable, Optional # noqa: F401
4
4
 
5
5
  import dlt
6
+ from dlt.common import jsonpath as jp # noqa: F401
6
7
  from dlt.common import pendulum
7
8
  from dlt.common.time import ensure_pendulum_datetime
8
9
  from dlt.common.typing import TAnyDateTime, TDataItem
@@ -12,6 +13,7 @@ from .helpers import ShopifyApi, ShopifyGraphQLApi, TOrderStatus
12
13
  from .settings import (
13
14
  DEFAULT_API_VERSION,
14
15
  DEFAULT_ITEMS_PER_PAGE,
16
+ DEFAULT_PARTNER_API_VERSION, # noqa: F401
15
17
  FIRST_DAY_OF_MILLENNIUM,
16
18
  )
17
19