ingestr 0.7.5__tar.gz → 0.7.6__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of ingestr might be problematic. Click here for more details.
- {ingestr-0.7.5 → ingestr-0.7.6}/PKG-INFO +11 -1
- {ingestr-0.7.5 → ingestr-0.7.6}/README.md +10 -0
- {ingestr-0.7.5 → ingestr-0.7.6}/docs/.vitepress/config.mjs +26 -8
- {ingestr-0.7.5 → ingestr-0.7.6}/docs/commands/example-uris.md +1 -1
- {ingestr-0.7.5 → ingestr-0.7.6}/docs/getting-started/quickstart.md +1 -1
- ingestr-0.7.6/docs/supported-sources/chess.md +37 -0
- ingestr-0.7.6/docs/supported-sources/hubspot.md +45 -0
- {ingestr-0.7.5 → ingestr-0.7.6}/ingestr/main.py +23 -2
- ingestr-0.7.6/ingestr/src/chess/__init__.py +166 -0
- ingestr-0.7.6/ingestr/src/chess/helpers.py +21 -0
- ingestr-0.7.6/ingestr/src/chess/settings.py +4 -0
- {ingestr-0.7.5 → ingestr-0.7.6}/ingestr/src/factory.py +6 -1
- ingestr-0.7.6/ingestr/src/hubspot/__init__.py +281 -0
- ingestr-0.7.6/ingestr/src/hubspot/helpers.py +188 -0
- ingestr-0.7.6/ingestr/src/hubspot/settings.py +99 -0
- {ingestr-0.7.5 → ingestr-0.7.6}/ingestr/src/sources.py +81 -0
- ingestr-0.7.6/ingestr/src/version.py +1 -0
- {ingestr-0.7.5 → ingestr-0.7.6}/package-lock.json +233 -118
- {ingestr-0.7.5 → ingestr-0.7.6}/package.json +1 -1
- {ingestr-0.7.5 → ingestr-0.7.6}/pyproject.toml +5 -1
- ingestr-0.7.5/docs/supported-sources/overview.md +0 -109
- ingestr-0.7.5/ingestr/src/version.py +0 -1
- {ingestr-0.7.5 → ingestr-0.7.6}/.dockerignore +0 -0
- {ingestr-0.7.5 → ingestr-0.7.6}/.github/workflows/deploy-docs.yml +0 -0
- {ingestr-0.7.5 → ingestr-0.7.6}/.github/workflows/tests.yml +0 -0
- {ingestr-0.7.5 → ingestr-0.7.6}/.gitignore +0 -0
- {ingestr-0.7.5 → ingestr-0.7.6}/.python-version +0 -0
- {ingestr-0.7.5 → ingestr-0.7.6}/Dockerfile +0 -0
- {ingestr-0.7.5 → ingestr-0.7.6}/LICENSE.md +0 -0
- {ingestr-0.7.5 → ingestr-0.7.6}/Makefile +0 -0
- {ingestr-0.7.5 → ingestr-0.7.6}/docs/.vitepress/theme/custom.css +0 -0
- {ingestr-0.7.5 → ingestr-0.7.6}/docs/.vitepress/theme/index.js +0 -0
- {ingestr-0.7.5 → ingestr-0.7.6}/docs/commands/ingest.md +0 -0
- {ingestr-0.7.5 → ingestr-0.7.6}/docs/getting-started/core-concepts.md +0 -0
- {ingestr-0.7.5 → ingestr-0.7.6}/docs/getting-started/incremental-loading.md +0 -0
- {ingestr-0.7.5 → ingestr-0.7.6}/docs/getting-started/telemetry.md +0 -0
- {ingestr-0.7.5 → ingestr-0.7.6}/docs/index.md +0 -0
- {ingestr-0.7.5 → ingestr-0.7.6}/docs/supported-sources/bigquery.md +0 -0
- {ingestr-0.7.5 → ingestr-0.7.6}/docs/supported-sources/csv.md +0 -0
- {ingestr-0.7.5 → ingestr-0.7.6}/docs/supported-sources/databricks.md +0 -0
- {ingestr-0.7.5 → ingestr-0.7.6}/docs/supported-sources/duckdb.md +0 -0
- {ingestr-0.7.5 → ingestr-0.7.6}/docs/supported-sources/gorgias.md +0 -0
- {ingestr-0.7.5 → ingestr-0.7.6}/docs/supported-sources/gsheets.md +0 -0
- {ingestr-0.7.5 → ingestr-0.7.6}/docs/supported-sources/mongodb.md +0 -0
- {ingestr-0.7.5 → ingestr-0.7.6}/docs/supported-sources/mssql.md +0 -0
- {ingestr-0.7.5 → ingestr-0.7.6}/docs/supported-sources/mysql.md +0 -0
- {ingestr-0.7.5 → ingestr-0.7.6}/docs/supported-sources/notion.md +0 -0
- {ingestr-0.7.5 → ingestr-0.7.6}/docs/supported-sources/oracle.md +0 -0
- {ingestr-0.7.5 → ingestr-0.7.6}/docs/supported-sources/postgres.md +0 -0
- {ingestr-0.7.5 → ingestr-0.7.6}/docs/supported-sources/redshift.md +0 -0
- {ingestr-0.7.5 → ingestr-0.7.6}/docs/supported-sources/sap-hana.md +0 -0
- {ingestr-0.7.5 → ingestr-0.7.6}/docs/supported-sources/shopify.md +0 -0
- {ingestr-0.7.5 → ingestr-0.7.6}/docs/supported-sources/snowflake.md +0 -0
- {ingestr-0.7.5 → ingestr-0.7.6}/docs/supported-sources/sqlite.md +0 -0
- {ingestr-0.7.5 → ingestr-0.7.6}/docs/supported-sources/stripe.md +0 -0
- {ingestr-0.7.5 → ingestr-0.7.6}/ingestr/src/destinations.py +0 -0
- {ingestr-0.7.5 → ingestr-0.7.6}/ingestr/src/google_sheets/README.md +0 -0
- {ingestr-0.7.5 → ingestr-0.7.6}/ingestr/src/google_sheets/__init__.py +0 -0
- {ingestr-0.7.5 → ingestr-0.7.6}/ingestr/src/google_sheets/helpers/__init__.py +0 -0
- {ingestr-0.7.5 → ingestr-0.7.6}/ingestr/src/google_sheets/helpers/api_calls.py +0 -0
- {ingestr-0.7.5 → ingestr-0.7.6}/ingestr/src/google_sheets/helpers/data_processing.py +0 -0
- {ingestr-0.7.5 → ingestr-0.7.6}/ingestr/src/gorgias/__init__.py +0 -0
- {ingestr-0.7.5 → ingestr-0.7.6}/ingestr/src/gorgias/helpers.py +0 -0
- {ingestr-0.7.5 → ingestr-0.7.6}/ingestr/src/mongodb/__init__.py +0 -0
- {ingestr-0.7.5 → ingestr-0.7.6}/ingestr/src/mongodb/helpers.py +0 -0
- {ingestr-0.7.5 → ingestr-0.7.6}/ingestr/src/notion/__init__.py +0 -0
- {ingestr-0.7.5 → ingestr-0.7.6}/ingestr/src/notion/helpers/__init__.py +0 -0
- {ingestr-0.7.5 → ingestr-0.7.6}/ingestr/src/notion/helpers/client.py +0 -0
- {ingestr-0.7.5 → ingestr-0.7.6}/ingestr/src/notion/helpers/database.py +0 -0
- {ingestr-0.7.5 → ingestr-0.7.6}/ingestr/src/notion/settings.py +0 -0
- {ingestr-0.7.5 → ingestr-0.7.6}/ingestr/src/shopify/__init__.py +0 -0
- {ingestr-0.7.5 → ingestr-0.7.6}/ingestr/src/shopify/exceptions.py +0 -0
- {ingestr-0.7.5 → ingestr-0.7.6}/ingestr/src/shopify/helpers.py +0 -0
- {ingestr-0.7.5 → ingestr-0.7.6}/ingestr/src/shopify/settings.py +0 -0
- {ingestr-0.7.5 → ingestr-0.7.6}/ingestr/src/sql_database/__init__.py +0 -0
- {ingestr-0.7.5 → ingestr-0.7.6}/ingestr/src/sql_database/arrow_helpers.py +0 -0
- {ingestr-0.7.5 → ingestr-0.7.6}/ingestr/src/sql_database/helpers.py +0 -0
- {ingestr-0.7.5 → ingestr-0.7.6}/ingestr/src/sql_database/override.py +0 -0
- {ingestr-0.7.5 → ingestr-0.7.6}/ingestr/src/sql_database/schema_types.py +0 -0
- {ingestr-0.7.5 → ingestr-0.7.6}/ingestr/src/stripe_analytics/__init__.py +0 -0
- {ingestr-0.7.5 → ingestr-0.7.6}/ingestr/src/stripe_analytics/helpers.py +0 -0
- {ingestr-0.7.5 → ingestr-0.7.6}/ingestr/src/stripe_analytics/settings.py +0 -0
- {ingestr-0.7.5 → ingestr-0.7.6}/ingestr/src/table_definition.py +0 -0
- {ingestr-0.7.5 → ingestr-0.7.6}/ingestr/src/telemetry/event.py +0 -0
- {ingestr-0.7.5 → ingestr-0.7.6}/ingestr/src/testdata/fakebqcredentials.json +0 -0
- {ingestr-0.7.5 → ingestr-0.7.6}/ingestr/testdata/.gitignore +0 -0
- {ingestr-0.7.5 → ingestr-0.7.6}/ingestr/testdata/create_replace.csv +0 -0
- {ingestr-0.7.5 → ingestr-0.7.6}/ingestr/testdata/delete_insert_expected.csv +0 -0
- {ingestr-0.7.5 → ingestr-0.7.6}/ingestr/testdata/delete_insert_part1.csv +0 -0
- {ingestr-0.7.5 → ingestr-0.7.6}/ingestr/testdata/delete_insert_part2.csv +0 -0
- {ingestr-0.7.5 → ingestr-0.7.6}/ingestr/testdata/merge_expected.csv +0 -0
- {ingestr-0.7.5 → ingestr-0.7.6}/ingestr/testdata/merge_part1.csv +0 -0
- {ingestr-0.7.5 → ingestr-0.7.6}/ingestr/testdata/merge_part2.csv +0 -0
- {ingestr-0.7.5 → ingestr-0.7.6}/requirements-dev.txt +0 -0
- {ingestr-0.7.5 → ingestr-0.7.6}/requirements.txt +0 -0
- {ingestr-0.7.5 → ingestr-0.7.6}/resources/demo.gif +0 -0
- {ingestr-0.7.5 → ingestr-0.7.6}/resources/demo.tape +0 -0
- {ingestr-0.7.5 → ingestr-0.7.6}/resources/ingestr.svg +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.3
|
|
2
2
|
Name: ingestr
|
|
3
|
-
Version: 0.7.5
|
|
3
|
+
Version: 0.7.6
|
|
4
4
|
Summary: ingestr is a command-line application that ingests data from various sources and stores them in any database.
|
|
5
5
|
Project-URL: Homepage, https://github.com/bruin-data/ingestr
|
|
6
6
|
Project-URL: Issues, https://github.com/bruin-data/ingestr/issues
|
|
@@ -173,6 +173,11 @@ Join our Slack community [here](https://join.slack.com/t/bruindatacommunity/shar
|
|
|
173
173
|
<tr>
|
|
174
174
|
<td colspan="3" style='text-align:center;'><strong>Platforms</strong></td>
|
|
175
175
|
</tr>
|
|
176
|
+
<tr>
|
|
177
|
+
<td>Chess.com</td>
|
|
178
|
+
<td>✅</td>
|
|
179
|
+
<td>-</td>
|
|
180
|
+
</tr>
|
|
176
181
|
<tr>
|
|
177
182
|
<td>Gorgias</td>
|
|
178
183
|
<td>✅</td>
|
|
@@ -183,6 +188,11 @@ Join our Slack community [here](https://join.slack.com/t/bruindatacommunity/shar
|
|
|
183
188
|
<td>✅</td>
|
|
184
189
|
<td>-</td>
|
|
185
190
|
</tr>
|
|
191
|
+
<tr>
|
|
192
|
+
<td>HubSpot</td>
|
|
193
|
+
<td>✅</td>
|
|
194
|
+
<td>-</td>
|
|
195
|
+
</tr>
|
|
186
196
|
<tr>
|
|
187
197
|
<td>Notion</td>
|
|
188
198
|
<td>✅</td>
|
|
@@ -128,6 +128,11 @@ Join our Slack community [here](https://join.slack.com/t/bruindatacommunity/shar
|
|
|
128
128
|
<tr>
|
|
129
129
|
<td colspan="3" style='text-align:center;'><strong>Platforms</strong></td>
|
|
130
130
|
</tr>
|
|
131
|
+
<tr>
|
|
132
|
+
<td>Chess.com</td>
|
|
133
|
+
<td>✅</td>
|
|
134
|
+
<td>-</td>
|
|
135
|
+
</tr>
|
|
131
136
|
<tr>
|
|
132
137
|
<td>Gorgias</td>
|
|
133
138
|
<td>✅</td>
|
|
@@ -138,6 +143,11 @@ Join our Slack community [here](https://join.slack.com/t/bruindatacommunity/shar
|
|
|
138
143
|
<td>✅</td>
|
|
139
144
|
<td>-</td>
|
|
140
145
|
</tr>
|
|
146
|
+
<tr>
|
|
147
|
+
<td>HubSpot</td>
|
|
148
|
+
<td>✅</td>
|
|
149
|
+
<td>-</td>
|
|
150
|
+
</tr>
|
|
141
151
|
<tr>
|
|
142
152
|
<td>Notion</td>
|
|
143
153
|
<td>✅</td>
|
|
@@ -6,7 +6,13 @@ export default defineConfig({
|
|
|
6
6
|
description: "Ingest & copy data between any source and any destination",
|
|
7
7
|
base: "/ingestr/",
|
|
8
8
|
head: [
|
|
9
|
-
[
|
|
9
|
+
[
|
|
10
|
+
"script",
|
|
11
|
+
{
|
|
12
|
+
async: "",
|
|
13
|
+
src: "https://www.googletagmanager.com/gtag/js?id=G-MZJ20PP4MJ",
|
|
14
|
+
},
|
|
15
|
+
],
|
|
10
16
|
[
|
|
11
17
|
"script",
|
|
12
18
|
{},
|
|
@@ -22,7 +28,7 @@ export default defineConfig({
|
|
|
22
28
|
{ text: "Home", link: "/" },
|
|
23
29
|
{ text: "Getting started", link: "/getting-started/quickstart.md" },
|
|
24
30
|
],
|
|
25
|
-
outline:
|
|
31
|
+
outline: "deep",
|
|
26
32
|
|
|
27
33
|
sidebar: [
|
|
28
34
|
{
|
|
@@ -30,7 +36,10 @@ export default defineConfig({
|
|
|
30
36
|
items: [
|
|
31
37
|
{ text: "Quickstart", link: "/getting-started/quickstart.md" },
|
|
32
38
|
{ text: "Core Concepts", link: "/getting-started/core-concepts.md" },
|
|
33
|
-
{
|
|
39
|
+
{
|
|
40
|
+
text: "Incremental Loading",
|
|
41
|
+
link: "/getting-started/incremental-loading.md",
|
|
42
|
+
},
|
|
34
43
|
{ text: "Telemetry", link: "/getting-started/telemetry.md" },
|
|
35
44
|
],
|
|
36
45
|
},
|
|
@@ -44,7 +53,6 @@ export default defineConfig({
|
|
|
44
53
|
{
|
|
45
54
|
text: "Sources & Destinations",
|
|
46
55
|
items: [
|
|
47
|
-
{ text: "Overview", link: "/supported-sources/overview.md" },
|
|
48
56
|
{
|
|
49
57
|
text: "Databases",
|
|
50
58
|
collapsed: false,
|
|
@@ -52,9 +60,15 @@ export default defineConfig({
|
|
|
52
60
|
{ text: "AWS Redshift", link: "/supported-sources/redshift.md" },
|
|
53
61
|
{ text: "Databricks", link: "/supported-sources/databricks.md" },
|
|
54
62
|
{ text: "DuckDB", link: "/supported-sources/duckdb.md" },
|
|
55
|
-
{
|
|
63
|
+
{
|
|
64
|
+
text: "Google BigQuery",
|
|
65
|
+
link: "/supported-sources/bigquery.md",
|
|
66
|
+
},
|
|
56
67
|
{ text: "Local CSV Files", link: "/supported-sources/csv.md" },
|
|
57
|
-
{
|
|
68
|
+
{
|
|
69
|
+
text: "Microsoft SQL Server",
|
|
70
|
+
link: "/supported-sources/mssql.md",
|
|
71
|
+
},
|
|
58
72
|
{ text: "MongoDB", link: "/supported-sources/mongodb.md" },
|
|
59
73
|
{ text: "MySQL", link: "/supported-sources/mysql.md" },
|
|
60
74
|
{ text: "Oracle", link: "/supported-sources/oracle.md" },
|
|
@@ -69,8 +83,10 @@ export default defineConfig({
|
|
|
69
83
|
text: "Platforms",
|
|
70
84
|
collapsed: false,
|
|
71
85
|
items: [
|
|
72
|
-
{ text: "
|
|
86
|
+
{ text: "Chess.com", link: "/supported-sources/chess.md" },
|
|
73
87
|
{ text: "Google Sheets", link: "/supported-sources/gsheets.md" },
|
|
88
|
+
{ text: "Gorgias", link: "/supported-sources/gorgias.md" },
|
|
89
|
+
{ text: "HubSpot", link: "/supported-sources/hubspot.md" },
|
|
74
90
|
{ text: "Notion", link: "/supported-sources/notion.md" },
|
|
75
91
|
{ text: "Shopify", link: "/supported-sources/shopify.md" },
|
|
76
92
|
{ text: "Stripe", link: "/supported-sources/stripe.md" },
|
|
@@ -80,6 +96,8 @@ export default defineConfig({
|
|
|
80
96
|
},
|
|
81
97
|
],
|
|
82
98
|
|
|
83
|
-
socialLinks: [
|
|
99
|
+
socialLinks: [
|
|
100
|
+
{ icon: "github", link: "https://github.com/bruin-data/ingestr" },
|
|
101
|
+
],
|
|
84
102
|
},
|
|
85
103
|
});
|
|
@@ -2,4 +2,4 @@
|
|
|
2
2
|
|
|
3
3
|
This command is supposed to serve as a guide for the user to understand the various URI formats that are supported by the `ingestr` tool. The command will provide a list of supported sources and destinations, along with the URI format for each of them.
|
|
4
4
|
|
|
5
|
-
For the detailed documentation, please refer to the
|
|
5
|
+
For the detailed documentation, please refer to the Sources & Destinations section on the sidebar.
|
|
@@ -36,4 +36,4 @@ This command will:
|
|
|
36
36
|
|
|
37
37
|
## Supported Sources & Destinations
|
|
38
38
|
|
|
39
|
-
See the
|
|
39
|
+
See the Supported Sources & Destinations page for a list of all supported sources and destinations. More to come soon!
|
|
@@ -0,0 +1,37 @@
|
|
|
1
|
+
# Chess.com
|
|
2
|
+
|
|
3
|
+
[Chess.com](https://www.chess.com/) is an online platform offering chess games, tournaments, lessons, and more.
|
|
4
|
+
|
|
5
|
+
ingestr supports Chess.com as a source, primarily to play around with the data of players, games, and more since it doesn't require any authentication.
|
|
6
|
+
|
|
7
|
+
## URI Format
|
|
8
|
+
|
|
9
|
+
The URI format for Chess is as follows:
|
|
10
|
+
|
|
11
|
+
```plaintext
|
|
12
|
+
--source-uri 'chess://?players=<List[str]>'
|
|
13
|
+
```
|
|
14
|
+
|
|
15
|
+
URI parameter:
|
|
16
|
+
|
|
17
|
+
- `players`: A list of player usernames for which you want to fetch data. If no usernames are provided, data for 4 different players will be fetched.
|
|
18
|
+
|
|
19
|
+
## Setting up a Chess Integration
|
|
20
|
+
|
|
21
|
+
Let's say you have a list of player usernames: max2 and peter23. Here's a sample command that will copy the data from Chess into a DuckDB database:
|
|
22
|
+
|
|
23
|
+
```sh
|
|
24
|
+
ingestr ingest --source-uri 'chess://?players=max2,peter23' --source-table 'profiles' --dest-uri 'duckdb:///chess.duckdb' --dest-table 'players.profiles'
|
|
25
|
+
```
|
|
26
|
+
|
|
27
|
+
The result of this command will be a table in the `chess.duckdb` database.
|
|
28
|
+
|
|
29
|
+
## Available Tables
|
|
30
|
+
|
|
31
|
+
Chess source allows ingesting the following sources into separate tables:
|
|
32
|
+
|
|
33
|
+
- `profiles`: Retrieves player profiles based on a list of player usernames.
|
|
34
|
+
- `games`: Retrieves players' games for specified players.
|
|
35
|
+
- `archives`: Retrieves URLs to game archives for specified players.
|
|
36
|
+
|
|
37
|
+
Use these as `--source-table` parameter in the `ingestr ingest` command.
|
|
@@ -0,0 +1,45 @@
|
|
|
1
|
+
# HubSpot
|
|
2
|
+
|
|
3
|
+
[HubSpot](https://www.hubspot.com/) is a customer relationship management software that helps businesses attract visitors, connect with customers, and close deals.
|
|
4
|
+
|
|
5
|
+
ingestr supports HubSpot as a source.
|
|
6
|
+
|
|
7
|
+
## URI Format
|
|
8
|
+
|
|
9
|
+
The URI format for HubSpot is as follows:
|
|
10
|
+
|
|
11
|
+
```plaintext
|
|
12
|
+
hubspot://?api_key=<api-key-here>
|
|
13
|
+
```
|
|
14
|
+
|
|
15
|
+
URI parameters:
|
|
16
|
+
|
|
17
|
+
- `api_key`: The API key is used for authentication with the HubSpot API.
|
|
18
|
+
|
|
19
|
+
The URI is used to connect to the HubSpot API for extracting data.
|
|
20
|
+
|
|
21
|
+
## Setting up a HubSpot Integration
|
|
22
|
+
|
|
23
|
+
HubSpot requires a few steps to set up an integration. Please follow the guide dltHub [has built here](https://dlthub.com/docs/dlt-ecosystem/verified-sources/hubspot#setup-guide).
|
|
24
|
+
|
|
25
|
+
Once you complete the guide, you should have an API key. Let's say your API key is `pat_test_12345`, here's a sample command that will copy the data from HubSpot into a duckdb database:
|
|
26
|
+
|
|
27
|
+
```sh
|
|
28
|
+
ingestr ingest --source-uri 'hubspot://?api_key=pat_test_12345' --source-table 'companies' --dest-uri duckdb:///hubspot.duckdb --dest-table 'companies.data'
|
|
29
|
+
```
|
|
30
|
+
|
|
31
|
+
The result of this command will be a table in the `hubspot.duckdb` database.
|
|
32
|
+
|
|
33
|
+
## Available Tables
|
|
34
|
+
|
|
35
|
+
HubSpot source allows ingesting the following sources into separate tables:
|
|
36
|
+
|
|
37
|
+
- `companies`: Retrieves information about organizations.
|
|
38
|
+
- `deals`: Retrieves deal records and tracks deal progress.
|
|
39
|
+
- `products`: Retrieves pricing information of products.
|
|
40
|
+
- `tickets`: Handles requests for help from customers or users.
|
|
41
|
+
- `quotes`: Retrieves price proposals that salespeople can create and send to their contacts.
|
|
42
|
+
- `hubspot_events_for_objects`: Retrieves web analytics events for a given object type and object IDs.
|
|
43
|
+
- `contacts`: Retrieves information about visitors, potential customers, and leads.
|
|
44
|
+
|
|
45
|
+
Use these as `--source-table` parameter in the `ingestr ingest` command.
|
|
@@ -1,6 +1,7 @@
|
|
|
1
1
|
import hashlib
|
|
2
2
|
from datetime import datetime
|
|
3
3
|
from enum import Enum
|
|
4
|
+
import tempfile
|
|
4
5
|
from typing import Optional
|
|
5
6
|
|
|
6
7
|
import dlt
|
|
@@ -236,6 +237,13 @@ def ingest(
|
|
|
236
237
|
envvar="SCHEMA_NAMING",
|
|
237
238
|
),
|
|
238
239
|
] = SchemaNaming.default, # type: ignore
|
|
240
|
+
pipelines_dir: Annotated[
|
|
241
|
+
Optional[str],
|
|
242
|
+
typer.Option(
|
|
243
|
+
help="The path to store dlt-related pipeline metadata. By default, ingestr will create a temporary directory and delete it after the execution is done in order to make retries stateless.",
|
|
244
|
+
envvar="PIPELINES_DIR",
|
|
245
|
+
),
|
|
246
|
+
] = None, # type: ignore
|
|
239
247
|
):
|
|
240
248
|
track(
|
|
241
249
|
"command_triggered",
|
|
@@ -280,13 +288,18 @@ def ingest(
|
|
|
280
288
|
if progress == Progress.log:
|
|
281
289
|
progressInstance = LogCollector(dump_system_stats=False)
|
|
282
290
|
|
|
291
|
+
is_pipelines_dir_temp = False
|
|
292
|
+
if pipelines_dir is None:
|
|
293
|
+
pipelines_dir = tempfile.mkdtemp()
|
|
294
|
+
is_pipelines_dir_temp = True
|
|
295
|
+
|
|
283
296
|
pipeline = dlt.pipeline(
|
|
284
297
|
pipeline_name=m.hexdigest(),
|
|
285
298
|
destination=destination.dlt_dest(
|
|
286
299
|
uri=dest_uri,
|
|
287
300
|
),
|
|
288
301
|
progress=progressInstance,
|
|
289
|
-
pipelines_dir=
|
|
302
|
+
pipelines_dir=pipelines_dir,
|
|
290
303
|
refresh="drop_resources" if full_refresh else None,
|
|
291
304
|
)
|
|
292
305
|
|
|
@@ -362,6 +375,8 @@ def ingest(
|
|
|
362
375
|
if incremental_strategy != IncrementalStrategy.none:
|
|
363
376
|
write_disposition = incremental_strategy.value
|
|
364
377
|
|
|
378
|
+
start_time = datetime.now()
|
|
379
|
+
|
|
365
380
|
run_info: LoadInfo = pipeline.run(
|
|
366
381
|
dlt_source,
|
|
367
382
|
**destination.dlt_run_params(
|
|
@@ -389,11 +404,17 @@ def ingest(
|
|
|
389
404
|
|
|
390
405
|
destination.post_load()
|
|
391
406
|
|
|
407
|
+
end_time = datetime.now()
|
|
392
408
|
elapsedHuman = ""
|
|
393
409
|
if run_info.started_at:
|
|
394
|
-
elapsed =
|
|
410
|
+
elapsed = end_time - start_time
|
|
395
411
|
elapsedHuman = f"in {humanize.precisedelta(elapsed)}"
|
|
396
412
|
|
|
413
|
+
# remove the pipelines_dir folder if it was created by ingestr
|
|
414
|
+
if is_pipelines_dir_temp:
|
|
415
|
+
import shutil
|
|
416
|
+
shutil.rmtree(pipelines_dir)
|
|
417
|
+
|
|
397
418
|
print(
|
|
398
419
|
f"[bold green]Successfully finished loading data from '{factory.source_scheme}' to '{factory.destination_scheme}' {elapsedHuman} [/bold green]"
|
|
399
420
|
)
|
|
@@ -0,0 +1,166 @@
|
|
|
1
|
+
"""A source loading player profiles and games from chess.com api"""
|
|
2
|
+
|
|
3
|
+
from typing import Any, Callable, Dict, Iterator, List, Sequence
|
|
4
|
+
|
|
5
|
+
import dlt
|
|
6
|
+
from dlt.common import pendulum
|
|
7
|
+
from dlt.common.typing import TDataItem
|
|
8
|
+
from dlt.sources import DltResource
|
|
9
|
+
from dlt.sources.helpers import requests
|
|
10
|
+
|
|
11
|
+
from .helpers import get_path_with_retry, get_url_with_retry, validate_month_string
|
|
12
|
+
from .settings import UNOFFICIAL_CHESS_API_URL
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
@dlt.source(name="chess")
def source(
    players: List[str], start_month: str = None, end_month: str = None
) -> Sequence[DltResource]:
    """
    Bundle the chess.com resources defined in this module into a single dlt source.

    Args:
        players (List[str]): Usernames of the players to load data for.
        start_month (str, optional): Forwarded to `players_games`; archives for months
            before it are skipped. Defaults to None.
        end_month (str, optional): Forwarded to `players_games`; archives for months
            after it are skipped. Defaults to None.

    Returns:
        Sequence[DltResource]: The players_profiles, players_archives, players_games and
        players_online_status resources, in that order.
    """
    profiles = players_profiles(players)
    archives = players_archives(players)
    games = players_games(players, start_month=start_month, end_month=end_month)
    online_status = players_online_status(players)
    return (profiles, archives, games, online_status)
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
@dlt.resource(
    write_disposition="replace",
    columns={
        "last_online": {"data_type": "timestamp"},
        "joined": {"data_type": "timestamp"},
    },
)
def players_profiles(players: List[str]) -> Iterator[TDataItem]:
    """
    Yield one deferred profile lookup per player username.

    Args:
        players (List[str]): Usernames whose profiles should be retrieved.

    Yields:
        Iterator[TDataItem]: The profile data returned for each player.
    """

    # Each profile request is wrapped in `dlt.defer`; per the original author's
    # note this lets the HTTP requests be carried out in parallel.
    @dlt.defer
    def _fetch_profile(player: str) -> TDataItem:
        profile_path = f"player/{player}"
        return get_path_with_retry(profile_path)

    yield from (_fetch_profile(player) for player in players)
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
@dlt.resource(write_disposition="replace", selected=False)
def players_archives(players: List[str]) -> Iterator[List[TDataItem]]:
    """
    Yield, for every player, the list of game-archive URLs found in the response.

    Args:
        players (List[str]): Usernames whose archive listings should be retrieved.

    Yields:
        Iterator[List[TDataItem]]: The value stored under "archives" in each
        player's response, or an empty list when that key is absent.
    """
    for player in players:
        archives_path = f"player/{player}/games/archives"
        response = get_path_with_retry(archives_path)
        yield response.get("archives", [])
|
|
75
|
+
|
|
76
|
+
|
|
77
|
+
@dlt.resource(
    write_disposition="append", columns={"end_time": {"data_type": "timestamp"}}
)
def players_games(
    players: List[str], start_month: str = None, end_month: str = None
) -> Iterator[Callable[[], List[TDataItem]]]:
    """
    Yield deferred downloads of the monthly game archives of `players`.

    Archives outside the `start_month`..`end_month` window are skipped, and so
    are archives whose URL is already recorded in the resource state.

    Args:
        players (List[str]): Usernames whose games should be retrieved.
        start_month (str, optional): First month to include, as "YYYY/MM". Defaults to None.
        end_month (str, optional): Last month to include, as "YYYY/MM". Defaults to None.

    Yields:
        Iterator[Callable[[], List[TDataItem]]]: One deferred call per archive,
        each producing that archive's list of games.
    """
    # A simple validation to prevent common mistakes in the month format.
    for month_bound in (start_month, end_month):
        validate_month_string(month_bound)

    # URLs of archives that were already processed. Per the original author's
    # note, the resource state is a dictionary that has the same content the
    # next time this function is called.
    seen_archives = dlt.current.resource_state().setdefault("archives", [])
    # A resource can be called like a function and iterated like a list.
    archive_urls = players_archives(players)

    # Wrapped in `dlt.defer` so the archive downloads can run in parallel.
    @dlt.defer
    def _download_games(archive_url: str) -> List[TDataItem]:
        try:
            payload = get_url_with_retry(archive_url)
        except requests.HTTPError as http_err:
            # Some archives are unavailable and the error appears permanent:
            # treat a 404 as "no games", propagate everything else.
            if http_err.response.status_code != 404:
                raise
            return []
        return payload.get("games", [])  # type: ignore

    for archive_url in archive_urls:
        # URL format: https://api.chess.com/pub/player/{username}/games/{YYYY}/{MM}
        # so the last seven characters are the archive's "YYYY/MM" month.
        month = archive_url[-7:]
        outside_window = (start_month and month < start_month) or (
            end_month and month > end_month
        )
        # Skip months outside the window and archives downloaded before.
        if outside_window or archive_url in seen_archives:
            continue
        seen_archives.append(archive_url)
        yield _download_games(archive_url)
|
|
126
|
+
|
|
127
|
+
|
|
128
|
+
@dlt.resource(write_disposition="append")
def players_online_status(players: List[str]) -> Iterator[TDataItem]:
    """
    Yield the current online status of every player in `players`.

    Args:
        players (List[str]): Usernames to check.

    Yields:
        Iterator[TDataItem]: A dict per player with the username, the
        "onlineStatus" and "lastLoginDate" values from the response, and the
        time the check was made.
    """
    # The unofficial endpoint is used here; per the original author's note the
    # official one seems to have been removed.
    for username in players:
        popup_url = f"{UNOFFICIAL_CHESS_API_URL}user/popup/{username}"
        popup = get_url_with_retry(popup_url)
        # Keep only the relevant fields of the response.
        record = {
            "username": username,
            "onlineStatus": popup["onlineStatus"],
            "lastLoginDate": popup["lastLoginDate"],
            "check_time": pendulum.now(),  # dlt can deal with native python dates
        }
        yield record
|
|
147
|
+
|
|
148
|
+
|
|
149
|
+
@dlt.source
def chess_dlt_config_example(
    secret_str: str = dlt.secrets.value,
    secret_dict: Dict[str, Any] = dlt.secrets.value,
    config_int: int = dlt.config.value,
) -> DltResource:
    """
    Example source showing secrets and config values supplied through dlt.

    Args:
        secret_str (str, optional): Secret string. Defaults to dlt.secrets.value.
        secret_dict (Dict[str, Any], optional): Secret dictionary. Defaults to dlt.secrets.value.
        config_int (int, optional): Config integer. Defaults to dlt.config.value.

    Returns:
        DltResource: A resource named "config_values" that yields the three
        configured values.
    """
    # This is just a test: the resource simply hands the received values back.
    configured_values = [secret_str, secret_dict, config_int]
    return dlt.resource(configured_values, name="config_values")
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
"""Chess source helpers"""
|
|
2
|
+
|
|
3
|
+
from dlt.common.typing import StrAny
|
|
4
|
+
from dlt.sources.helpers import requests
|
|
5
|
+
|
|
6
|
+
from .settings import OFFICIAL_CHESS_API_URL
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
def get_url_with_retry(url: str) -> StrAny:
    """Fetch `url` with a GET request and return the decoded JSON body.

    No retry logic is implemented here; presumably it is provided by dlt's
    `requests` helper - verify against the dlt documentation.
    """
    return requests.get(url).json()  # type: ignore
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def get_path_with_retry(path: str) -> StrAny:
    """Fetch `path` relative to OFFICIAL_CHESS_API_URL and return the JSON body."""
    full_url = f"{OFFICIAL_CHESS_API_URL}{path}"
    return get_url_with_retry(full_url)
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
def validate_month_string(string: str) -> None:
    """Validate that `string` is a month in YYYY/MM format.

    A falsy value (None or an empty string) means "no bound" and is accepted.

    Args:
        string: The month string to check, e.g. "2022/05".

    Raises:
        ValueError: If `string` is non-empty and is not four digits, a slash
            and a two-digit month between 01 and 12. The offending value is
            the exception argument.
    """
    if not string:
        return

    year, separator, month = string.partition("/")
    digits = year + month
    well_formed = (
        separator == "/"
        and len(year) == 4
        and len(month) == 2
        # isascii() guards int() against non-ASCII "digits" such as superscripts
        and digits.isascii()
        and digits.isdigit()
        and 1 <= int(month) <= 12
    )
    # Checking the whole shape (not only the character at index 4) also means
    # short inputs such as "2022" raise ValueError instead of IndexError.
    if not well_formed:
        raise ValueError(string)
|
|
@@ -15,8 +15,10 @@ from ingestr.src.destinations import (
|
|
|
15
15
|
SynapseDestination,
|
|
16
16
|
)
|
|
17
17
|
from ingestr.src.sources import (
|
|
18
|
+
ChessSource,
|
|
18
19
|
GoogleSheetsSource,
|
|
19
20
|
GorgiasSource,
|
|
21
|
+
HubspotSource,
|
|
20
22
|
LocalCsvSource,
|
|
21
23
|
MongoDbSource,
|
|
22
24
|
NotionSource,
|
|
@@ -103,9 +105,12 @@ class SourceDestinationFactory:
|
|
|
103
105
|
return ShopifySource()
|
|
104
106
|
elif self.source_scheme == "gorgias":
|
|
105
107
|
return GorgiasSource()
|
|
108
|
+
elif self.source_scheme == "chess":
|
|
109
|
+
return ChessSource()
|
|
106
110
|
elif self.source_scheme == "stripe":
|
|
107
111
|
return StripeAnalyticsSource()
|
|
108
|
-
|
|
112
|
+
elif self.source_scheme == "hubspot":
|
|
113
|
+
return HubspotSource()
|
|
109
114
|
else:
|
|
110
115
|
raise ValueError(f"Unsupported source scheme: {self.source_scheme}")
|
|
111
116
|
|