ingestr 0.12.3__tar.gz → 0.12.4__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of ingestr might be problematic. Click here for more details.
- {ingestr-0.12.3 → ingestr-0.12.4}/.githooks/pre-commit-hook.sh +1 -1
- {ingestr-0.12.3 → ingestr-0.12.4}/PKG-INFO +2 -1
- {ingestr-0.12.3 → ingestr-0.12.4}/docs/.vitepress/config.mjs +2 -0
- ingestr-0.12.4/docs/commands/ingest.md +110 -0
- ingestr-0.12.4/docs/media/github.png +0 -0
- ingestr-0.12.4/docs/media/googleanalytics.png +0 -0
- ingestr-0.12.4/docs/supported-sources/github.md +49 -0
- ingestr-0.12.4/docs/supported-sources/google_analytics.md +44 -0
- {ingestr-0.12.3 → ingestr-0.12.4}/ingestr/main.py +51 -4
- {ingestr-0.12.3 → ingestr-0.12.4}/ingestr/src/factory.py +4 -0
- ingestr-0.12.4/ingestr/src/github/__init__.py +149 -0
- ingestr-0.12.4/ingestr/src/github/helpers.py +193 -0
- ingestr-0.12.4/ingestr/src/github/queries.py +115 -0
- ingestr-0.12.4/ingestr/src/github/settings.py +10 -0
- ingestr-0.12.4/ingestr/src/google_analytics/__init__.py +70 -0
- ingestr-0.12.4/ingestr/src/google_analytics/helpers/__init__.py +70 -0
- ingestr-0.12.4/ingestr/src/google_analytics/helpers/data_processing.py +176 -0
- {ingestr-0.12.3 → ingestr-0.12.4}/ingestr/src/sources.py +106 -3
- ingestr-0.12.4/ingestr/src/version.py +1 -0
- {ingestr-0.12.3 → ingestr-0.12.4}/pyproject.toml +4 -0
- {ingestr-0.12.3 → ingestr-0.12.4}/requirements.txt +2 -1
- ingestr-0.12.3/docs/commands/ingest.md +0 -44
- ingestr-0.12.3/ingestr/src/version.py +0 -1
- {ingestr-0.12.3 → ingestr-0.12.4}/.dockerignore +0 -0
- {ingestr-0.12.3 → ingestr-0.12.4}/.github/workflows/deploy-docs.yml +0 -0
- {ingestr-0.12.3 → ingestr-0.12.4}/.github/workflows/secrets-scan.yml +0 -0
- {ingestr-0.12.3 → ingestr-0.12.4}/.github/workflows/tests.yml +0 -0
- {ingestr-0.12.3 → ingestr-0.12.4}/.gitignore +0 -0
- {ingestr-0.12.3 → ingestr-0.12.4}/.gitleaksignore +0 -0
- {ingestr-0.12.3 → ingestr-0.12.4}/.python-version +0 -0
- {ingestr-0.12.3 → ingestr-0.12.4}/.vale.ini +0 -0
- {ingestr-0.12.3 → ingestr-0.12.4}/Dockerfile +0 -0
- {ingestr-0.12.3 → ingestr-0.12.4}/LICENSE.md +0 -0
- {ingestr-0.12.3 → ingestr-0.12.4}/Makefile +0 -0
- {ingestr-0.12.3 → ingestr-0.12.4}/README.md +0 -0
- {ingestr-0.12.3 → ingestr-0.12.4}/docs/.vitepress/theme/custom.css +0 -0
- {ingestr-0.12.3 → ingestr-0.12.4}/docs/.vitepress/theme/index.js +0 -0
- {ingestr-0.12.3 → ingestr-0.12.4}/docs/commands/example-uris.md +0 -0
- {ingestr-0.12.3 → ingestr-0.12.4}/docs/getting-started/core-concepts.md +0 -0
- {ingestr-0.12.3 → ingestr-0.12.4}/docs/getting-started/incremental-loading.md +0 -0
- {ingestr-0.12.3 → ingestr-0.12.4}/docs/getting-started/quickstart.md +0 -0
- {ingestr-0.12.3 → ingestr-0.12.4}/docs/getting-started/telemetry.md +0 -0
- {ingestr-0.12.3 → ingestr-0.12.4}/docs/index.md +0 -0
- {ingestr-0.12.3 → ingestr-0.12.4}/docs/media/athena.png +0 -0
- {ingestr-0.12.3 → ingestr-0.12.4}/docs/media/tiktok.png +0 -0
- {ingestr-0.12.3 → ingestr-0.12.4}/docs/supported-sources/adjust.md +0 -0
- {ingestr-0.12.3 → ingestr-0.12.4}/docs/supported-sources/airtable.md +0 -0
- {ingestr-0.12.3 → ingestr-0.12.4}/docs/supported-sources/appsflyer.md +0 -0
- {ingestr-0.12.3 → ingestr-0.12.4}/docs/supported-sources/asana.md +0 -0
- {ingestr-0.12.3 → ingestr-0.12.4}/docs/supported-sources/athena.md +0 -0
- {ingestr-0.12.3 → ingestr-0.12.4}/docs/supported-sources/bigquery.md +0 -0
- {ingestr-0.12.3 → ingestr-0.12.4}/docs/supported-sources/chess.md +0 -0
- {ingestr-0.12.3 → ingestr-0.12.4}/docs/supported-sources/csv.md +0 -0
- {ingestr-0.12.3 → ingestr-0.12.4}/docs/supported-sources/custom_queries.md +0 -0
- {ingestr-0.12.3 → ingestr-0.12.4}/docs/supported-sources/databricks.md +0 -0
- {ingestr-0.12.3 → ingestr-0.12.4}/docs/supported-sources/duckdb.md +0 -0
- {ingestr-0.12.3 → ingestr-0.12.4}/docs/supported-sources/dynamodb.md +0 -0
- {ingestr-0.12.3 → ingestr-0.12.4}/docs/supported-sources/facebook-ads.md +0 -0
- {ingestr-0.12.3 → ingestr-0.12.4}/docs/supported-sources/gorgias.md +0 -0
- {ingestr-0.12.3 → ingestr-0.12.4}/docs/supported-sources/gsheets.md +0 -0
- {ingestr-0.12.3 → ingestr-0.12.4}/docs/supported-sources/hubspot.md +0 -0
- {ingestr-0.12.3 → ingestr-0.12.4}/docs/supported-sources/kafka.md +0 -0
- {ingestr-0.12.3 → ingestr-0.12.4}/docs/supported-sources/klaviyo.md +0 -0
- {ingestr-0.12.3 → ingestr-0.12.4}/docs/supported-sources/mongodb.md +0 -0
- {ingestr-0.12.3 → ingestr-0.12.4}/docs/supported-sources/mssql.md +0 -0
- {ingestr-0.12.3 → ingestr-0.12.4}/docs/supported-sources/mysql.md +0 -0
- {ingestr-0.12.3 → ingestr-0.12.4}/docs/supported-sources/notion.md +0 -0
- {ingestr-0.12.3 → ingestr-0.12.4}/docs/supported-sources/oracle.md +0 -0
- {ingestr-0.12.3 → ingestr-0.12.4}/docs/supported-sources/postgres.md +0 -0
- {ingestr-0.12.3 → ingestr-0.12.4}/docs/supported-sources/redshift.md +0 -0
- {ingestr-0.12.3 → ingestr-0.12.4}/docs/supported-sources/s3.md +0 -0
- {ingestr-0.12.3 → ingestr-0.12.4}/docs/supported-sources/sap-hana.md +0 -0
- {ingestr-0.12.3 → ingestr-0.12.4}/docs/supported-sources/shopify.md +0 -0
- {ingestr-0.12.3 → ingestr-0.12.4}/docs/supported-sources/slack.md +0 -0
- {ingestr-0.12.3 → ingestr-0.12.4}/docs/supported-sources/snowflake.md +0 -0
- {ingestr-0.12.3 → ingestr-0.12.4}/docs/supported-sources/sqlite.md +0 -0
- {ingestr-0.12.3 → ingestr-0.12.4}/docs/supported-sources/stripe.md +0 -0
- {ingestr-0.12.3 → ingestr-0.12.4}/docs/supported-sources/tiktok-ads.md +0 -0
- {ingestr-0.12.3 → ingestr-0.12.4}/docs/supported-sources/zendesk.md +0 -0
- {ingestr-0.12.3 → ingestr-0.12.4}/ingestr/src/.gitignore +0 -0
- {ingestr-0.12.3 → ingestr-0.12.4}/ingestr/src/adjust/__init__.py +0 -0
- {ingestr-0.12.3 → ingestr-0.12.4}/ingestr/src/adjust/adjust_helpers.py +0 -0
- {ingestr-0.12.3 → ingestr-0.12.4}/ingestr/src/airtable/__init__.py +0 -0
- {ingestr-0.12.3 → ingestr-0.12.4}/ingestr/src/appsflyer/_init_.py +0 -0
- {ingestr-0.12.3 → ingestr-0.12.4}/ingestr/src/appsflyer/client.py +0 -0
- {ingestr-0.12.3 → ingestr-0.12.4}/ingestr/src/arrow/__init__.py +0 -0
- {ingestr-0.12.3 → ingestr-0.12.4}/ingestr/src/asana_source/__init__.py +0 -0
- {ingestr-0.12.3 → ingestr-0.12.4}/ingestr/src/asana_source/helpers.py +0 -0
- {ingestr-0.12.3 → ingestr-0.12.4}/ingestr/src/asana_source/settings.py +0 -0
- {ingestr-0.12.3 → ingestr-0.12.4}/ingestr/src/chess/__init__.py +0 -0
- {ingestr-0.12.3 → ingestr-0.12.4}/ingestr/src/chess/helpers.py +0 -0
- {ingestr-0.12.3 → ingestr-0.12.4}/ingestr/src/chess/settings.py +0 -0
- {ingestr-0.12.3 → ingestr-0.12.4}/ingestr/src/destinations.py +0 -0
- {ingestr-0.12.3 → ingestr-0.12.4}/ingestr/src/dynamodb/__init__.py +0 -0
- {ingestr-0.12.3 → ingestr-0.12.4}/ingestr/src/facebook_ads/__init__.py +0 -0
- {ingestr-0.12.3 → ingestr-0.12.4}/ingestr/src/facebook_ads/exceptions.py +0 -0
- {ingestr-0.12.3 → ingestr-0.12.4}/ingestr/src/facebook_ads/helpers.py +0 -0
- {ingestr-0.12.3 → ingestr-0.12.4}/ingestr/src/facebook_ads/settings.py +0 -0
- {ingestr-0.12.3 → ingestr-0.12.4}/ingestr/src/filesystem/__init__.py +0 -0
- {ingestr-0.12.3 → ingestr-0.12.4}/ingestr/src/filesystem/helpers.py +0 -0
- {ingestr-0.12.3 → ingestr-0.12.4}/ingestr/src/filesystem/readers.py +0 -0
- {ingestr-0.12.3 → ingestr-0.12.4}/ingestr/src/filters.py +0 -0
- {ingestr-0.12.3 → ingestr-0.12.4}/ingestr/src/google_sheets/README.md +0 -0
- {ingestr-0.12.3 → ingestr-0.12.4}/ingestr/src/google_sheets/__init__.py +0 -0
- {ingestr-0.12.3 → ingestr-0.12.4}/ingestr/src/google_sheets/helpers/__init__.py +0 -0
- {ingestr-0.12.3 → ingestr-0.12.4}/ingestr/src/google_sheets/helpers/api_calls.py +0 -0
- {ingestr-0.12.3 → ingestr-0.12.4}/ingestr/src/google_sheets/helpers/data_processing.py +0 -0
- {ingestr-0.12.3 → ingestr-0.12.4}/ingestr/src/gorgias/__init__.py +0 -0
- {ingestr-0.12.3 → ingestr-0.12.4}/ingestr/src/gorgias/helpers.py +0 -0
- {ingestr-0.12.3 → ingestr-0.12.4}/ingestr/src/hubspot/__init__.py +0 -0
- {ingestr-0.12.3 → ingestr-0.12.4}/ingestr/src/hubspot/helpers.py +0 -0
- {ingestr-0.12.3 → ingestr-0.12.4}/ingestr/src/hubspot/settings.py +0 -0
- {ingestr-0.12.3 → ingestr-0.12.4}/ingestr/src/kafka/__init__.py +0 -0
- {ingestr-0.12.3 → ingestr-0.12.4}/ingestr/src/kafka/helpers.py +0 -0
- {ingestr-0.12.3 → ingestr-0.12.4}/ingestr/src/klaviyo/_init_.py +0 -0
- {ingestr-0.12.3 → ingestr-0.12.4}/ingestr/src/klaviyo/client.py +0 -0
- {ingestr-0.12.3 → ingestr-0.12.4}/ingestr/src/klaviyo/helpers.py +0 -0
- {ingestr-0.12.3 → ingestr-0.12.4}/ingestr/src/mongodb/__init__.py +0 -0
- {ingestr-0.12.3 → ingestr-0.12.4}/ingestr/src/mongodb/helpers.py +0 -0
- {ingestr-0.12.3 → ingestr-0.12.4}/ingestr/src/notion/__init__.py +0 -0
- {ingestr-0.12.3 → ingestr-0.12.4}/ingestr/src/notion/helpers/__init__.py +0 -0
- {ingestr-0.12.3 → ingestr-0.12.4}/ingestr/src/notion/helpers/client.py +0 -0
- {ingestr-0.12.3 → ingestr-0.12.4}/ingestr/src/notion/helpers/database.py +0 -0
- {ingestr-0.12.3 → ingestr-0.12.4}/ingestr/src/notion/settings.py +0 -0
- {ingestr-0.12.3 → ingestr-0.12.4}/ingestr/src/shopify/__init__.py +0 -0
- {ingestr-0.12.3 → ingestr-0.12.4}/ingestr/src/shopify/exceptions.py +0 -0
- {ingestr-0.12.3 → ingestr-0.12.4}/ingestr/src/shopify/helpers.py +0 -0
- {ingestr-0.12.3 → ingestr-0.12.4}/ingestr/src/shopify/settings.py +0 -0
- {ingestr-0.12.3 → ingestr-0.12.4}/ingestr/src/slack/__init__.py +0 -0
- {ingestr-0.12.3 → ingestr-0.12.4}/ingestr/src/slack/helpers.py +0 -0
- {ingestr-0.12.3 → ingestr-0.12.4}/ingestr/src/slack/settings.py +0 -0
- {ingestr-0.12.3 → ingestr-0.12.4}/ingestr/src/stripe_analytics/__init__.py +0 -0
- {ingestr-0.12.3 → ingestr-0.12.4}/ingestr/src/stripe_analytics/helpers.py +0 -0
- {ingestr-0.12.3 → ingestr-0.12.4}/ingestr/src/stripe_analytics/settings.py +0 -0
- {ingestr-0.12.3 → ingestr-0.12.4}/ingestr/src/table_definition.py +0 -0
- {ingestr-0.12.3 → ingestr-0.12.4}/ingestr/src/telemetry/event.py +0 -0
- {ingestr-0.12.3 → ingestr-0.12.4}/ingestr/src/testdata/fakebqcredentials.json +0 -0
- {ingestr-0.12.3 → ingestr-0.12.4}/ingestr/src/tiktok_ads/__init__.py +0 -0
- {ingestr-0.12.3 → ingestr-0.12.4}/ingestr/src/tiktok_ads/tiktok_helpers.py +0 -0
- {ingestr-0.12.3 → ingestr-0.12.4}/ingestr/src/time.py +0 -0
- {ingestr-0.12.3 → ingestr-0.12.4}/ingestr/src/zendesk/__init__.py +0 -0
- {ingestr-0.12.3 → ingestr-0.12.4}/ingestr/src/zendesk/helpers/__init__.py +0 -0
- {ingestr-0.12.3 → ingestr-0.12.4}/ingestr/src/zendesk/helpers/api_helpers.py +0 -0
- {ingestr-0.12.3 → ingestr-0.12.4}/ingestr/src/zendesk/helpers/credentials.py +0 -0
- {ingestr-0.12.3 → ingestr-0.12.4}/ingestr/src/zendesk/helpers/talk_api.py +0 -0
- {ingestr-0.12.3 → ingestr-0.12.4}/ingestr/src/zendesk/settings.py +0 -0
- {ingestr-0.12.3 → ingestr-0.12.4}/ingestr/testdata/.gitignore +0 -0
- {ingestr-0.12.3 → ingestr-0.12.4}/ingestr/testdata/create_replace.csv +0 -0
- {ingestr-0.12.3 → ingestr-0.12.4}/ingestr/testdata/delete_insert_expected.csv +0 -0
- {ingestr-0.12.3 → ingestr-0.12.4}/ingestr/testdata/delete_insert_part1.csv +0 -0
- {ingestr-0.12.3 → ingestr-0.12.4}/ingestr/testdata/delete_insert_part2.csv +0 -0
- {ingestr-0.12.3 → ingestr-0.12.4}/ingestr/testdata/merge_expected.csv +0 -0
- {ingestr-0.12.3 → ingestr-0.12.4}/ingestr/testdata/merge_part1.csv +0 -0
- {ingestr-0.12.3 → ingestr-0.12.4}/ingestr/testdata/merge_part2.csv +0 -0
- {ingestr-0.12.3 → ingestr-0.12.4}/package-lock.json +0 -0
- {ingestr-0.12.3 → ingestr-0.12.4}/package.json +0 -0
- {ingestr-0.12.3 → ingestr-0.12.4}/requirements-dev.txt +0 -0
- {ingestr-0.12.3 → ingestr-0.12.4}/resources/demo.gif +0 -0
- {ingestr-0.12.3 → ingestr-0.12.4}/resources/demo.tape +0 -0
- {ingestr-0.12.3 → ingestr-0.12.4}/resources/ingestr.svg +0 -0
- {ingestr-0.12.3 → ingestr-0.12.4}/styles/Google/AMPM.yml +0 -0
- {ingestr-0.12.3 → ingestr-0.12.4}/styles/Google/Acronyms.yml +0 -0
- {ingestr-0.12.3 → ingestr-0.12.4}/styles/Google/Colons.yml +0 -0
- {ingestr-0.12.3 → ingestr-0.12.4}/styles/Google/Contractions.yml +0 -0
- {ingestr-0.12.3 → ingestr-0.12.4}/styles/Google/DateFormat.yml +0 -0
- {ingestr-0.12.3 → ingestr-0.12.4}/styles/Google/Ellipses.yml +0 -0
- {ingestr-0.12.3 → ingestr-0.12.4}/styles/Google/EmDash.yml +0 -0
- {ingestr-0.12.3 → ingestr-0.12.4}/styles/Google/Exclamation.yml +0 -0
- {ingestr-0.12.3 → ingestr-0.12.4}/styles/Google/FirstPerson.yml +0 -0
- {ingestr-0.12.3 → ingestr-0.12.4}/styles/Google/Gender.yml +0 -0
- {ingestr-0.12.3 → ingestr-0.12.4}/styles/Google/GenderBias.yml +0 -0
- {ingestr-0.12.3 → ingestr-0.12.4}/styles/Google/HeadingPunctuation.yml +0 -0
- {ingestr-0.12.3 → ingestr-0.12.4}/styles/Google/Headings.yml +0 -0
- {ingestr-0.12.3 → ingestr-0.12.4}/styles/Google/Latin.yml +0 -0
- {ingestr-0.12.3 → ingestr-0.12.4}/styles/Google/LyHyphens.yml +0 -0
- {ingestr-0.12.3 → ingestr-0.12.4}/styles/Google/OptionalPlurals.yml +0 -0
- {ingestr-0.12.3 → ingestr-0.12.4}/styles/Google/Ordinal.yml +0 -0
- {ingestr-0.12.3 → ingestr-0.12.4}/styles/Google/OxfordComma.yml +0 -0
- {ingestr-0.12.3 → ingestr-0.12.4}/styles/Google/Parens.yml +0 -0
- {ingestr-0.12.3 → ingestr-0.12.4}/styles/Google/Passive.yml +0 -0
- {ingestr-0.12.3 → ingestr-0.12.4}/styles/Google/Periods.yml +0 -0
- {ingestr-0.12.3 → ingestr-0.12.4}/styles/Google/Quotes.yml +0 -0
- {ingestr-0.12.3 → ingestr-0.12.4}/styles/Google/Ranges.yml +0 -0
- {ingestr-0.12.3 → ingestr-0.12.4}/styles/Google/Semicolons.yml +0 -0
- {ingestr-0.12.3 → ingestr-0.12.4}/styles/Google/Slang.yml +0 -0
- {ingestr-0.12.3 → ingestr-0.12.4}/styles/Google/Spacing.yml +0 -0
- {ingestr-0.12.3 → ingestr-0.12.4}/styles/Google/Spelling.yml +0 -0
- {ingestr-0.12.3 → ingestr-0.12.4}/styles/Google/Units.yml +0 -0
- {ingestr-0.12.3 → ingestr-0.12.4}/styles/Google/We.yml +0 -0
- {ingestr-0.12.3 → ingestr-0.12.4}/styles/Google/Will.yml +0 -0
- {ingestr-0.12.3 → ingestr-0.12.4}/styles/Google/WordList.yml +0 -0
- {ingestr-0.12.3 → ingestr-0.12.4}/styles/Google/meta.json +0 -0
- {ingestr-0.12.3 → ingestr-0.12.4}/styles/Google/vocab.txt +0 -0
- {ingestr-0.12.3 → ingestr-0.12.4}/styles/bruin/Ingestr.yml +0 -0
- {ingestr-0.12.3 → ingestr-0.12.4}/styles/config/vocabularies/bruin/accept.txt +0 -0
|
@@ -17,7 +17,7 @@ CMD="gitleaks dir -v"
|
|
|
17
17
|
|
|
18
18
|
if [[ ! `which gitleaks` ]]; then
|
|
19
19
|
which docker > /dev/null || (echo "gitleaks or docker is required for running secrets scan." && exit 1)
|
|
20
|
-
CMD="docker run -v $PWD:$WORK_DIR -w $WORK_DIR ghcr.io/gitleaks/gitleaks:latest dir -v"
|
|
20
|
+
CMD="docker run -v $PWD:$WORK_DIR -w $WORK_DIR --rm ghcr.io/gitleaks/gitleaks:latest dir -v"
|
|
21
21
|
fi
|
|
22
22
|
|
|
23
23
|
$CMD || secret_detected
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: ingestr
|
|
3
|
-
Version: 0.12.3
|
|
3
|
+
Version: 0.12.4
|
|
4
4
|
Summary: ingestr is a command-line application that ingests data from various sources and stores them in any database.
|
|
5
5
|
Project-URL: Homepage, https://github.com/bruin-data/ingestr
|
|
6
6
|
Project-URL: Issues, https://github.com/bruin-data/ingestr/issues
|
|
@@ -21,6 +21,7 @@ Requires-Dist: dlt==1.4.0
|
|
|
21
21
|
Requires-Dist: duckdb-engine==0.13.5
|
|
22
22
|
Requires-Dist: duckdb==1.1.3
|
|
23
23
|
Requires-Dist: facebook-business==20.0.0
|
|
24
|
+
Requires-Dist: google-analytics-data==0.18.15
|
|
24
25
|
Requires-Dist: google-api-python-client==2.130.0
|
|
25
26
|
Requires-Dist: google-cloud-bigquery-storage==2.24.0
|
|
26
27
|
Requires-Dist: mysql-connector-python==9.1.0
|
|
@@ -101,6 +101,8 @@ export default defineConfig({
|
|
|
101
101
|
text: "Facebook Ads",
|
|
102
102
|
link: "/supported-sources/facebook-ads.md",
|
|
103
103
|
},
|
|
104
|
+
{ text: "Google Analytics", link: "/supported-sources/google_analytics.md" },
|
|
105
|
+
{ text: "GitHub", link: "/supported-sources/github.md" },
|
|
104
106
|
{ text: "Google Sheets", link: "/supported-sources/gsheets.md" },
|
|
105
107
|
{ text: "Gorgias", link: "/supported-sources/gorgias.md" },
|
|
106
108
|
{ text: "HubSpot", link: "/supported-sources/hubspot.md" },
|
|
@@ -0,0 +1,110 @@
|
|
|
1
|
+
# `ingestr ingest`
|
|
2
|
+
|
|
3
|
+
The `ingest` command is a core feature of the `ingestr` tool, allowing users to transfer data from a source to a destination with optional support for incremental updates.
|
|
4
|
+
|
|
5
|
+
## Example
|
|
6
|
+
|
|
7
|
+
The following example demonstrates how to use the `ingest` command to transfer data from a source to a destination.
|
|
8
|
+
|
|
9
|
+
```bash
|
|
10
|
+
ingestr ingest \
|
|
11
|
+
--source-uri '<your-source-uri-here>' \
|
|
12
|
+
--source-table '<your-schema>.<your-table>' \
|
|
13
|
+
--dest-uri '<your-destination-uri-here>'
|
|
14
|
+
```
|
|
15
|
+
|
|
16
|
+
## Required flags
|
|
17
|
+
|
|
18
|
+
- `--source-uri TEXT`: Required. Specifies the URI of the data source.
|
|
19
|
+
- `--dest-uri TEXT`: Required. Specifies the URI of the destination where data will be ingested.
|
|
20
|
+
- `--source-table TEXT`: Required. Defines the source table to fetch data from.
|
|
21
|
+
|
|
22
|
+
## Optional flags
|
|
23
|
+
|
|
24
|
+
- `--dest-table TEXT`: Designates the destination table to save the data. If not specified, defaults to the value of `--source-table`.
|
|
25
|
+
- `--incremental-key TEXT`: Identifies the key used for incremental data strategies. Defaults to `None`.
|
|
26
|
+
- `--incremental-strategy TEXT`: Defines the strategy for incremental updates. Options include `replace`, `append`, `delete+insert`, or `merge`. The default strategy is `replace`.
|
|
27
|
+
- `--interval-start`: Sets the start of the interval for the incremental key. Defaults to `None`.
|
|
28
|
+
- `--interval-end`: Sets the end of the interval for the incremental key. Defaults to `None`.
|
|
29
|
+
- `--primary-key TEXT`: Specifies the primary key for the merge operation. Defaults to `None`.
|
|
30
|
+
- `--columns <column_name>:<column_type>`: Specifies the columns to be ingested. Defaults to `None`.
|
|
31
|
+
|
|
32
|
+
The `interval-start` and `interval-end` options support various datetime formats, here are some examples:
|
|
33
|
+
- `%Y-%m-%d`: `2023-01-31`
|
|
34
|
+
- `%Y-%m-%dT%H:%M:%S`: `2023-01-31T15:00:00`
|
|
35
|
+
- `%Y-%m-%dT%H:%M:%S%z`: `2023-01-31T15:00:00+00:00`
|
|
36
|
+
- `%Y-%m-%dT%H:%M:%S.%f`: `2023-01-31T15:00:00.000123`
|
|
37
|
+
- `%Y-%m-%dT%H:%M:%S.%f%z`: `2023-01-31T15:00:00.000123+00:00`
|
|
38
|
+
|
|
39
|
+
> [!INFO]
|
|
40
|
+
> For the details around the incremental key and the various strategies, please refer to the [Incremental Loading](../getting-started/incremental-loading.md) section.
|
|
41
|
+
|
|
42
|
+
## General flags
|
|
43
|
+
|
|
44
|
+
- `--help`: Displays the help message and exits the command.
|
|
45
|
+
|
|
46
|
+
## Examples
|
|
47
|
+
|
|
48
|
+
### Ingesting a CSV file to DuckDB
|
|
49
|
+
|
|
50
|
+
```bash
|
|
51
|
+
ingestr ingest \
|
|
52
|
+
--source-uri 'csv://input.csv' \
|
|
53
|
+
--source-table 'sample' \
|
|
54
|
+
--dest-uri 'duckdb://output.duckdb'
|
|
55
|
+
```
|
|
56
|
+
|
|
57
|
+
### Copy a table from Postgres to DuckDB
|
|
58
|
+
|
|
59
|
+
```bash
|
|
60
|
+
ingestr ingest \
|
|
61
|
+
--source-uri 'postgresql://myuser:mypassword@localhost:5432/mydatabase?sslmode=disable' \
|
|
62
|
+
--source-table 'public.input_table' \
|
|
63
|
+
--dest-uri 'duckdb://output.duckdb' \
|
|
64
|
+
--dest-table 'public.output_table'
|
|
65
|
+
```
|
|
66
|
+
|
|
67
|
+
### Incrementally ingest a table from Postgres to BigQuery
|
|
68
|
+
|
|
69
|
+
```bash
|
|
70
|
+
ingestr ingest
|
|
71
|
+
--source-uri 'postgresql://myuser:mypassword@localhost:5432/mydatabase?sslmode=disable' \
|
|
72
|
+
--source-table 'public.users' \
|
|
73
|
+
--dest-uri 'bigquery://my_project?credentials_path=/path/to/service/account.json&location=EU' \
|
|
74
|
+
--dest-table 'raw.users' \
|
|
75
|
+
--incremental-key 'updated_at' \
|
|
76
|
+
--incremental-strategy 'delete+insert'
|
|
77
|
+
```
|
|
78
|
+
|
|
79
|
+
### Load an interval of data from Postgres to BigQuery using a date column
|
|
80
|
+
|
|
81
|
+
```bash
|
|
82
|
+
ingestr ingest
|
|
83
|
+
--source-uri 'postgresql://myuser:mypassword@localhost:5432/mydatabase?sslmode=disable' \
|
|
84
|
+
--source-table 'public.users' \
|
|
85
|
+
--dest-uri 'bigquery://my_project?credentials_path=/path/to/service/account.json&location=EU' \
|
|
86
|
+
--dest-table 'raw.users' \
|
|
87
|
+
--incremental-key 'dt' \
|
|
88
|
+
--incremental-strategy 'delete+insert' \
|
|
89
|
+
--interval-start '2023-01-01' \
|
|
90
|
+
--interval-end '2023-01-31' \
|
|
91
|
+
--columns 'dt:date'
|
|
92
|
+
```
|
|
93
|
+
|
|
94
|
+
### Load a specific query from Postgres to Snowflake
|
|
95
|
+
|
|
96
|
+
```bash
|
|
97
|
+
ingestr ingest
|
|
98
|
+
--source-uri 'postgresql://myuser:mypassword@localhost:5432/mydatabase?sslmode=disable' \
|
|
99
|
+
--dest-uri 'snowflake://user:password@account/dbname?warehouse=COMPUTE_WH&role=my_role' \
|
|
100
|
+
--source-table 'query:SELECT * FROM public.users as pu JOIN public.orders as o ON pu.id = o.user_id WHERE pu.dt BETWEEN :interval_start AND :interval_end' \
|
|
101
|
+
--dest-table 'raw.users' \
|
|
102
|
+
--incremental-key 'dt' \
|
|
103
|
+
--incremental-strategy 'delete+insert' \
|
|
104
|
+
--interval-start '2023-01-01' \
|
|
105
|
+
--interval-end '2023-01-31' \
|
|
106
|
+
--columns 'dt:date'
|
|
107
|
+
```
|
|
108
|
+
|
|
109
|
+
> [!INFO]
|
|
110
|
+
> For more examples, please refer to the specific platforms' documentation on the sidebar.
|
|
Binary file
|
|
Binary file
|
|
@@ -0,0 +1,49 @@
|
|
|
1
|
+
# GitHub
|
|
2
|
+
|
|
3
|
+
[GitHub](https://github.com/) is a developer platform that allows developers to create, store, manage and share their code.
|
|
4
|
+
|
|
5
|
+
ingestr supports GitHub as a source.
|
|
6
|
+
|
|
7
|
+
## URI format
|
|
8
|
+
|
|
9
|
+
The URI format for GitHub is as follows:
|
|
10
|
+
|
|
11
|
+
```plaintext
|
|
12
|
+
github://?access_token=<access_token>&owner=<owner>&repo=<repo>
|
|
13
|
+
```
|
|
14
|
+
|
|
15
|
+
URI parameters:
|
|
16
|
+
|
|
17
|
+
- `access_token`: Access Token used for authentication with the GitHub API
|
|
18
|
+
- `owner`: Refers to the owner of the repository
|
|
19
|
+
- `repo`: Refers to the name of the repository
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
## Setting up a GitHub Integration
|
|
23
|
+
|
|
24
|
+
GitHub requires a few steps to set up an integration, please follow the guide dltHub [has built here](https://dlthub.com/docs/dlt-ecosystem/verified-sources/github#setup-guide).
|
|
25
|
+
|
|
26
|
+
Once you complete the guide, you should have an access token. Let's say your access token is `ghp_test_1234`, the owner is `max`, and the name of the repository is `test_example`. Here is a sample command that will copy the data from GitHub into a DuckDB database:
|
|
27
|
+
|
|
28
|
+
```sh
|
|
29
|
+
ingestr ingest --source-uri 'github://?access_token=ghp_test_1234&owner=max&repo=test_example' --source-table 'issues' --dest-uri duckdb:///github.duckdb --dest-table 'dest.issues'
|
|
30
|
+
```
|
|
31
|
+
|
|
32
|
+
This is a sample command that will copy the data from the GitHub source to DuckDB.
|
|
33
|
+
|
|
34
|
+
<img alt="github_img" src="../media/github.png" />
|
|
35
|
+
|
|
36
|
+
## Tables
|
|
37
|
+
|
|
38
|
+
GitHub source allows ingesting the following sources into separate tables:
|
|
39
|
+
|
|
40
|
+
- `issues`: Retrieves data about issues, their associated comments, and subsequent reactions
|
|
41
|
+
- `pull_requests`: Retrieves all pull requests
|
|
42
|
+
- `repo_events`: Retrieves all the repo events associated with the repository
|
|
43
|
+
- `stargazers`: Retrieves all stargazers
|
|
44
|
+
|
|
45
|
+
Use these as `--source-table` parameter in the `ingestr ingest` command.
|
|
46
|
+
|
|
47
|
+
> [!WARNING]
|
|
48
|
+
> GitHub does not support incremental loading for many endpoints in its APIs, which means ingestr will load endpoints incrementally if they support it, and do a full-refresh if not.
|
|
49
|
+
|
|
@@ -0,0 +1,44 @@
|
|
|
1
|
+
# Google Analytics
|
|
2
|
+
[Google Analytics](https://marketingplatform.google.com/about/analytics/) is a service for web analytics that tracks and provides data regarding user engagement with your website or application.
|
|
3
|
+
|
|
4
|
+
ingestr supports Google Analytics as a source.
|
|
5
|
+
|
|
6
|
+
## URI format
|
|
7
|
+
The URI format for Google Analytics is as follows:
|
|
8
|
+
|
|
9
|
+
```plaintext
|
|
10
|
+
googleanalytics://?credentials_path=/path/to/service/account.json&property_id=<property_id>
|
|
11
|
+
```
|
|
12
|
+
|
|
13
|
+
URI parameters:
|
|
14
|
+
- `credentials_path`: The path to the service account JSON file.
|
|
15
|
+
- `property_id`: It is a unique number that identifies a particular property on Google Analytics. [Follow this guide](https://developers.google.com/analytics/devguides/reporting/data/v1/property-id#what_is_my_property_id) if you don't know your property ID.
|
|
16
|
+
|
|
17
|
+
## Setting up an Google Analytics Integration
|
|
18
|
+
Google Analytics requires a few steps to set up an integration, please follow the guide dltHub [has built here](https://dlthub.com/docs/dlt-ecosystem/verified-sources/google_analytics#grab-google-service-account-credentials). Once you complete the guide, you should have an `.json` file and `project_id`.
|
|
19
|
+
|
|
20
|
+
## Table: Custom Reports
|
|
21
|
+
Custom reports allow you to retrieve data based on specific `dimensions` and `metrics`.
|
|
22
|
+
|
|
23
|
+
Custom Table Format:
|
|
24
|
+
```
|
|
25
|
+
custom:<dimensions>:<metrics>
|
|
26
|
+
```
|
|
27
|
+
|
|
28
|
+
### Parameters:
|
|
29
|
+
- `dimensions`(required): A comma-separated list of [dimensions](https://developers.google.com/analytics/devguides/reporting/data/v1/exploration-api-schema#dimensions) to retrieve.
|
|
30
|
+
- `metrics`(required): A comma-separated list of [metrics](https://developers.google.com/analytics/devguides/reporting/data/v1/exploration-api-schema#metrics) to retrieve.
|
|
31
|
+
|
|
32
|
+
### Example
|
|
33
|
+
|
|
34
|
+
```sh
|
|
35
|
+
ingestr ingest \
|
|
36
|
+
--source-uri "googleanalytics://?credentials_path="ingestr/src/g_analytics.json&property_id=id123" \
|
|
37
|
+
--source-table "custom:city,date:clicks,activeUsers,newUsers" \
|
|
38
|
+
--dest-uri "duckdb:///analytics.duckdb" \
|
|
39
|
+
--dest-table "dest.custom"
|
|
40
|
+
```
|
|
41
|
+
|
|
42
|
+
This command will retrieve report and save it to the `dest.custom` table in the DuckDB database.
|
|
43
|
+
|
|
44
|
+
<img alt="google_analytics_img" src="../media/googleanalytics.png" />
|
|
@@ -32,7 +32,7 @@ DATE_FORMATS = [
|
|
|
32
32
|
|
|
33
33
|
# https://dlthub.com/docs/dlt-ecosystem/file-formats/parquet#supported-destinations
|
|
34
34
|
PARQUET_SUPPORTED_DESTINATIONS = [
|
|
35
|
-
"bigquery",
|
|
35
|
+
"athena" "bigquery",
|
|
36
36
|
"duckdb",
|
|
37
37
|
"snowflake",
|
|
38
38
|
"databricks",
|
|
@@ -287,8 +287,14 @@ def ingest(
|
|
|
287
287
|
envvar="SQL_EXCLUDE_COLUMNS",
|
|
288
288
|
),
|
|
289
289
|
] = [], # type: ignore
|
|
290
|
+
columns: Annotated[
|
|
291
|
+
Optional[list[str]],
|
|
292
|
+
typer.Option(
|
|
293
|
+
help="The column types to be used for the destination table in the format of 'column_name:column_type'",
|
|
294
|
+
envvar="COLUMNS",
|
|
295
|
+
),
|
|
296
|
+
] = None, # type: ignore
|
|
290
297
|
):
|
|
291
|
-
# TODO(turtledev): can't we move this to the top of this file?
|
|
292
298
|
import hashlib
|
|
293
299
|
import tempfile
|
|
294
300
|
from datetime import datetime
|
|
@@ -296,6 +302,7 @@ def ingest(
|
|
|
296
302
|
import dlt
|
|
297
303
|
import humanize
|
|
298
304
|
import typer
|
|
305
|
+
from dlt.common.data_types import TDataType
|
|
299
306
|
from dlt.common.destination import Destination
|
|
300
307
|
from dlt.common.pipeline import LoadInfo
|
|
301
308
|
from dlt.common.runtime.collector import Collector, LogCollector
|
|
@@ -345,7 +352,7 @@ def ingest(
|
|
|
345
352
|
not in dlt_dest.capabilities().supported_loader_file_formats
|
|
346
353
|
):
|
|
347
354
|
print(
|
|
348
|
-
f"[red]Loader file format {loader_file_format.value} is not supported by the destination.[/red]"
|
|
355
|
+
f"[red]Loader file format {loader_file_format.value} is not supported by the destination, available formats: {dlt_dest.capabilities().supported_loader_file_formats}.[/red]"
|
|
349
356
|
)
|
|
350
357
|
raise typer.Abort()
|
|
351
358
|
|
|
@@ -357,6 +364,23 @@ def ingest(
|
|
|
357
364
|
else:
|
|
358
365
|
executable(source)
|
|
359
366
|
|
|
367
|
+
def parse_columns(columns: list[str]) -> dict[str, TDataType]:
|
|
368
|
+
from typing import cast, get_args
|
|
369
|
+
|
|
370
|
+
possible_types = get_args(TDataType)
|
|
371
|
+
|
|
372
|
+
types: dict[str, TDataType] = {}
|
|
373
|
+
for column in columns:
|
|
374
|
+
for candidate in column.split(","):
|
|
375
|
+
column_name, column_type = candidate.split(":")
|
|
376
|
+
if column_type not in possible_types:
|
|
377
|
+
print(
|
|
378
|
+
f"[red]Column type '{column_type}' is not supported, supported types: {possible_types}.[/red]"
|
|
379
|
+
)
|
|
380
|
+
raise typer.Abort()
|
|
381
|
+
types[column_name] = cast(TDataType, column_type)
|
|
382
|
+
return types
|
|
383
|
+
|
|
360
384
|
track(
|
|
361
385
|
"command_triggered",
|
|
362
386
|
{
|
|
@@ -399,12 +423,20 @@ def ingest(
|
|
|
399
423
|
column_hints: dict[str, TColumnSchema] = {}
|
|
400
424
|
original_incremental_strategy = incremental_strategy
|
|
401
425
|
|
|
426
|
+
if columns:
|
|
427
|
+
column_types = parse_columns(columns)
|
|
428
|
+
for column_name, column_type in column_types.items():
|
|
429
|
+
column_hints[column_name] = {"data_type": column_type}
|
|
430
|
+
|
|
402
431
|
merge_key = None
|
|
403
432
|
if incremental_strategy == IncrementalStrategy.delete_insert:
|
|
404
433
|
merge_key = incremental_key
|
|
405
434
|
incremental_strategy = IncrementalStrategy.merge
|
|
406
435
|
if incremental_key:
|
|
407
|
-
|
|
436
|
+
if incremental_key not in column_hints:
|
|
437
|
+
column_hints[incremental_key] = {}
|
|
438
|
+
|
|
439
|
+
column_hints[incremental_key]["merge_key"] = True
|
|
408
440
|
|
|
409
441
|
m = hashlib.sha256()
|
|
410
442
|
m.update(dest_table.encode("utf-8"))
|
|
@@ -491,6 +523,21 @@ def ingest(
|
|
|
491
523
|
if factory.source_scheme == "sqlite":
|
|
492
524
|
source_table = "main." + source_table.split(".")[-1]
|
|
493
525
|
|
|
526
|
+
if (
|
|
527
|
+
incremental_key
|
|
528
|
+
and incremental_key in column_hints
|
|
529
|
+
and "data_type" in column_hints[incremental_key]
|
|
530
|
+
and column_hints[incremental_key]["data_type"] == "date"
|
|
531
|
+
):
|
|
532
|
+
# By default, ingestr treats the start and end dates as datetime objects. While this worked fine for many cases, if the
|
|
533
|
+
# incremental field is a date, the start and end dates cannot be compared to the incremental field, and the ingestion would fail.
|
|
534
|
+
# In order to eliminate this, we have introduced a new option to ingestr, --columns, which allows the user to specify the column types for the destination table.
|
|
535
|
+
# This way, ingestr will know the data type of the incremental field, and will be able to convert the start and end dates to the correct data type before running the ingestion.
|
|
536
|
+
if interval_start:
|
|
537
|
+
interval_start = interval_start.date() # type: ignore
|
|
538
|
+
if interval_end:
|
|
539
|
+
interval_end = interval_end.date() # type: ignore
|
|
540
|
+
|
|
494
541
|
dlt_source = source.dlt_source(
|
|
495
542
|
uri=source_uri,
|
|
496
543
|
table=source_table,
|
|
@@ -24,6 +24,8 @@ from ingestr.src.sources import (
|
|
|
24
24
|
ChessSource,
|
|
25
25
|
DynamoDBSource,
|
|
26
26
|
FacebookAdsSource,
|
|
27
|
+
GitHubSource,
|
|
28
|
+
GoogleAnalyticsSource,
|
|
27
29
|
GoogleSheetsSource,
|
|
28
30
|
GorgiasSource,
|
|
29
31
|
HubspotSource,
|
|
@@ -102,6 +104,7 @@ class SourceDestinationFactory:
|
|
|
102
104
|
"gsheets": GoogleSheetsSource,
|
|
103
105
|
"shopify": ShopifySource,
|
|
104
106
|
"gorgias": GorgiasSource,
|
|
107
|
+
"github": GitHubSource,
|
|
105
108
|
"chess": ChessSource,
|
|
106
109
|
"stripe": StripeAnalyticsSource,
|
|
107
110
|
"facebookads": FacebookAdsSource,
|
|
@@ -118,6 +121,7 @@ class SourceDestinationFactory:
|
|
|
118
121
|
"dynamodb": DynamoDBSource,
|
|
119
122
|
"asana": AsanaSource,
|
|
120
123
|
"tiktok": TikTokSource,
|
|
124
|
+
"googleanalytics": GoogleAnalyticsSource,
|
|
121
125
|
}
|
|
122
126
|
destinations: Dict[str, Type[DestinationProtocol]] = {
|
|
123
127
|
"bigquery": BigQueryDestination,
|
|
@@ -0,0 +1,149 @@
|
|
|
1
|
+
"""Source that load github issues, pull requests and reactions for a specific repository via customizable graphql query. Loads events incrementally."""
|
|
2
|
+
|
|
3
|
+
import urllib.parse
|
|
4
|
+
from typing import Iterator, Optional, Sequence
|
|
5
|
+
|
|
6
|
+
import dlt
|
|
7
|
+
from dlt.common.typing import TDataItems
|
|
8
|
+
from dlt.sources import DltResource
|
|
9
|
+
|
|
10
|
+
from .helpers import get_reactions_data, get_rest_pages, get_stargazers
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
@dlt.source
def github_reactions(
    owner: str,
    name: str,
    access_token: str = dlt.secrets.value,
    items_per_page: int = 100,
    max_items: Optional[int] = None,
) -> Sequence[DltResource]:
    """Load reactions for issues and pull requests of the repository `owner`/`name`.

    Uses the GitHub GraphQL API to fetch all issues (`issues` resource) and pull
    requests (`pull_requests` resource) together with their reactions (up to 100),
    comments (up to 100) and reactions on those comments (also up to 100). The
    GraphQL queries are cost-optimized, so even fairly large repositories can be
    retrieved quickly and cheaply. The queries live in `queries.py` and can be
    extended with additional fields or connections; more repository nodes can be
    added as extra resources in the same fashion.

    Args:
        owner (str): The repository owner
        name (str): The repository name
        access_token (str): The classic access token. Will be injected from secrets if not provided.
        items_per_page (int, optional): How many issues/pull requests to get in single page. Defaults to 100.
        max_items (int, optional): How many issues/pull requests to get in total. None means All.

    Returns:
        Sequence[DltResource]: Two DltResources: `issues` with issues and `pull_requests` with pull requests
    """
    # Both resources differ only in the GraphQL node kind queried and the
    # destination resource name; build them uniformly from this mapping.
    node_kinds = (
        ("issues", "issues"),
        ("pullRequests", "pull_requests"),
    )
    return tuple(
        dlt.resource(
            get_reactions_data(
                node_kind,
                owner,
                name,
                access_token,
                items_per_page,
                max_items,
            ),
            name=resource_name,
            # Full refresh on every run: reactions can change on old items.
            write_disposition="replace",
        )
        for node_kind, resource_name in node_kinds
    )
|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
@dlt.source(max_table_nesting=0)
def github_repo_events(
    owner: str, name: str, access_token: Optional[str] = None
) -> DltResource:
    """Gets events for repository `name` with owner `owner` incrementally.

    This source contains a single resource `repo_events` that gets given repository's events and dispatches them to separate tables with names based on event type.
    The data is loaded incrementally. Subsequent runs will get only new events and append them to tables.
    Please note that Github allows only for 300 events to be retrieved for public repositories. You should get the events frequently for the active repos.

    Args:
        owner (str): The repository owner
        name (str): The repository name
        access_token (str): The classic or fine-grained access token. If not provided, calls are made anonymously

    Returns:
        DltSource: source with the `repo_events` resource

    """

    # use naming function in table name to generate separate tables for each event
    # dlt inspects the `dlt.sources.incremental` default below to track the
    # newest `created_at` seen (last_value_func=max) across pipeline runs.
    @dlt.resource(primary_key="id", table_name=lambda i: i["type"])
    def repo_events(
        last_created_at: dlt.sources.incremental[str] = dlt.sources.incremental(
            "created_at", initial_value="1970-01-01T00:00:00Z", last_value_func=max
        ),
    ) -> Iterator[TDataItems]:
        # URL-encode owner/name so they are safe to embed in the REST path.
        repos_path = (
            f"/repos/{urllib.parse.quote(owner)}/{urllib.parse.quote(name)}/events"
        )

        # Pages are yielded as-is; dlt filters out items older than the
        # incremental cursor on its own.
        for page in get_rest_pages(access_token, repos_path + "?per_page=100"):
            yield page

            # stop requesting pages if the last element was already older than initial value
            # note: incremental will skip those items anyway, we just do not want to use the api limits
            if last_created_at.start_out_of_range:
                print(
                    f"Overlap with previous run created at {last_created_at.initial_value}"
                )
                break

    return repo_events
|
|
111
|
+
|
|
112
|
+
|
|
113
|
+
@dlt.source
def github_stargazers(
    owner: str,
    name: str,
    access_token: str = dlt.secrets.value,
    items_per_page: int = 100,
    max_items: Optional[int] = None,
) -> Sequence[DltResource]:
    """Get stargazers in the repo `name` with owner `owner`.

    This source uses graphql to retrieve all stargazers with the associated starred date,
    Internally graphql is used to retrieve data. It is cost optimized and you are able to retrieve the
    data for fairly large repos quickly and cheaply.

    Args:
        owner (str): The repository owner
        name (str): The repository name
        access_token (str): The classic access token. Will be injected from secrets if not provided.
        items_per_page (int, optional): How many stargazers to get in a single page. Defaults to 100.
        max_items (int, optional): How many stargazers to get in total. None means all.

    Returns:
        Sequence[DltResource]: One DltResource: `stargazers`
    """
    return (
        dlt.resource(
            get_stargazers(
                owner,
                name,
                access_token,
                items_per_page,
                max_items,
            ),
            name="stargazers",
            # Full refresh on every run: the star list can shrink (un-stars).
            write_disposition="replace",
        ),
    )