ingestr 0.9.1__tar.gz → 0.9.3__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of ingestr might be problematic. Click here for more details.

Files changed (136) hide show
  1. {ingestr-0.9.1 → ingestr-0.9.3}/.gitignore +1 -0
  2. {ingestr-0.9.1 → ingestr-0.9.3}/Dockerfile +1 -1
  3. {ingestr-0.9.1 → ingestr-0.9.3}/PKG-INFO +3 -2
  4. ingestr-0.9.3/docs/supported-sources/adjust.md +84 -0
  5. {ingestr-0.9.1 → ingestr-0.9.3}/docs/supported-sources/kafka.md +3 -3
  6. {ingestr-0.9.1 → ingestr-0.9.3}/docs/supported-sources/mssql.md +8 -0
  7. {ingestr-0.9.1 → ingestr-0.9.3}/ingestr/main.py +3 -1
  8. ingestr-0.9.3/ingestr/src/adjust/__init__.py +100 -0
  9. ingestr-0.9.1/ingestr/src/adjust/helpers.py → ingestr-0.9.3/ingestr/src/adjust/adjust_helpers.py +46 -23
  10. ingestr-0.9.3/ingestr/src/arrow/__init__.py +77 -0
  11. {ingestr-0.9.1 → ingestr-0.9.3}/ingestr/src/factory.py +3 -0
  12. {ingestr-0.9.1 → ingestr-0.9.3}/ingestr/src/mongodb/__init__.py +1 -1
  13. {ingestr-0.9.1 → ingestr-0.9.3}/ingestr/src/mongodb/helpers.py +1 -1
  14. {ingestr-0.9.1 → ingestr-0.9.3}/ingestr/src/sources.py +109 -23
  15. ingestr-0.9.3/ingestr/src/version.py +1 -0
  16. {ingestr-0.9.1 → ingestr-0.9.3}/pyproject.toml +3 -0
  17. {ingestr-0.9.1 → ingestr-0.9.3}/requirements.txt +0 -1
  18. ingestr-0.9.1/docs/supported-sources/adjust.md +0 -30
  19. ingestr-0.9.1/ingestr/src/adjust/_init_.py +0 -31
  20. ingestr-0.9.1/ingestr/src/version.py +0 -1
  21. {ingestr-0.9.1 → ingestr-0.9.3}/.dockerignore +0 -0
  22. {ingestr-0.9.1 → ingestr-0.9.3}/.github/workflows/deploy-docs.yml +0 -0
  23. {ingestr-0.9.1 → ingestr-0.9.3}/.github/workflows/tests.yml +0 -0
  24. {ingestr-0.9.1 → ingestr-0.9.3}/.python-version +0 -0
  25. {ingestr-0.9.1 → ingestr-0.9.3}/LICENSE.md +0 -0
  26. {ingestr-0.9.1 → ingestr-0.9.3}/Makefile +0 -0
  27. {ingestr-0.9.1 → ingestr-0.9.3}/README.md +0 -0
  28. {ingestr-0.9.1 → ingestr-0.9.3}/docs/.vitepress/config.mjs +0 -0
  29. {ingestr-0.9.1 → ingestr-0.9.3}/docs/.vitepress/theme/custom.css +0 -0
  30. {ingestr-0.9.1 → ingestr-0.9.3}/docs/.vitepress/theme/index.js +0 -0
  31. {ingestr-0.9.1 → ingestr-0.9.3}/docs/commands/example-uris.md +0 -0
  32. {ingestr-0.9.1 → ingestr-0.9.3}/docs/commands/ingest.md +0 -0
  33. {ingestr-0.9.1 → ingestr-0.9.3}/docs/getting-started/core-concepts.md +0 -0
  34. {ingestr-0.9.1 → ingestr-0.9.3}/docs/getting-started/incremental-loading.md +0 -0
  35. {ingestr-0.9.1 → ingestr-0.9.3}/docs/getting-started/quickstart.md +0 -0
  36. {ingestr-0.9.1 → ingestr-0.9.3}/docs/getting-started/telemetry.md +0 -0
  37. {ingestr-0.9.1 → ingestr-0.9.3}/docs/index.md +0 -0
  38. {ingestr-0.9.1 → ingestr-0.9.3}/docs/supported-sources/airtable.md +0 -0
  39. {ingestr-0.9.1 → ingestr-0.9.3}/docs/supported-sources/appsflyer.md +0 -0
  40. {ingestr-0.9.1 → ingestr-0.9.3}/docs/supported-sources/bigquery.md +0 -0
  41. {ingestr-0.9.1 → ingestr-0.9.3}/docs/supported-sources/chess.md +0 -0
  42. {ingestr-0.9.1 → ingestr-0.9.3}/docs/supported-sources/csv.md +0 -0
  43. {ingestr-0.9.1 → ingestr-0.9.3}/docs/supported-sources/databricks.md +0 -0
  44. {ingestr-0.9.1 → ingestr-0.9.3}/docs/supported-sources/duckdb.md +0 -0
  45. {ingestr-0.9.1 → ingestr-0.9.3}/docs/supported-sources/facebook-ads.md +0 -0
  46. {ingestr-0.9.1 → ingestr-0.9.3}/docs/supported-sources/gorgias.md +0 -0
  47. {ingestr-0.9.1 → ingestr-0.9.3}/docs/supported-sources/gsheets.md +0 -0
  48. {ingestr-0.9.1 → ingestr-0.9.3}/docs/supported-sources/hubspot.md +0 -0
  49. {ingestr-0.9.1 → ingestr-0.9.3}/docs/supported-sources/klaviyo.md +0 -0
  50. {ingestr-0.9.1 → ingestr-0.9.3}/docs/supported-sources/mongodb.md +0 -0
  51. {ingestr-0.9.1 → ingestr-0.9.3}/docs/supported-sources/mysql.md +0 -0
  52. {ingestr-0.9.1 → ingestr-0.9.3}/docs/supported-sources/notion.md +0 -0
  53. {ingestr-0.9.1 → ingestr-0.9.3}/docs/supported-sources/oracle.md +0 -0
  54. {ingestr-0.9.1 → ingestr-0.9.3}/docs/supported-sources/postgres.md +0 -0
  55. {ingestr-0.9.1 → ingestr-0.9.3}/docs/supported-sources/redshift.md +0 -0
  56. {ingestr-0.9.1 → ingestr-0.9.3}/docs/supported-sources/s3.md +0 -0
  57. {ingestr-0.9.1 → ingestr-0.9.3}/docs/supported-sources/sap-hana.md +0 -0
  58. {ingestr-0.9.1 → ingestr-0.9.3}/docs/supported-sources/shopify.md +0 -0
  59. {ingestr-0.9.1 → ingestr-0.9.3}/docs/supported-sources/slack.md +0 -0
  60. {ingestr-0.9.1 → ingestr-0.9.3}/docs/supported-sources/snowflake.md +0 -0
  61. {ingestr-0.9.1 → ingestr-0.9.3}/docs/supported-sources/sqlite.md +0 -0
  62. {ingestr-0.9.1 → ingestr-0.9.3}/docs/supported-sources/stripe.md +0 -0
  63. {ingestr-0.9.1 → ingestr-0.9.3}/docs/supported-sources/zendesk.md +0 -0
  64. {ingestr-0.9.1 → ingestr-0.9.3}/ingestr/src/.gitignore +0 -0
  65. {ingestr-0.9.1 → ingestr-0.9.3}/ingestr/src/airtable/__init__.py +0 -0
  66. {ingestr-0.9.1 → ingestr-0.9.3}/ingestr/src/appsflyer/_init_.py +0 -0
  67. {ingestr-0.9.1 → ingestr-0.9.3}/ingestr/src/appsflyer/client.py +0 -0
  68. {ingestr-0.9.1 → ingestr-0.9.3}/ingestr/src/chess/__init__.py +0 -0
  69. {ingestr-0.9.1 → ingestr-0.9.3}/ingestr/src/chess/helpers.py +0 -0
  70. {ingestr-0.9.1 → ingestr-0.9.3}/ingestr/src/chess/settings.py +0 -0
  71. {ingestr-0.9.1 → ingestr-0.9.3}/ingestr/src/destinations.py +0 -0
  72. {ingestr-0.9.1 → ingestr-0.9.3}/ingestr/src/facebook_ads/__init__.py +0 -0
  73. {ingestr-0.9.1 → ingestr-0.9.3}/ingestr/src/facebook_ads/exceptions.py +0 -0
  74. {ingestr-0.9.1 → ingestr-0.9.3}/ingestr/src/facebook_ads/helpers.py +0 -0
  75. {ingestr-0.9.1 → ingestr-0.9.3}/ingestr/src/facebook_ads/settings.py +0 -0
  76. {ingestr-0.9.1 → ingestr-0.9.3}/ingestr/src/filesystem/__init__.py +0 -0
  77. {ingestr-0.9.1 → ingestr-0.9.3}/ingestr/src/filesystem/helpers.py +0 -0
  78. {ingestr-0.9.1 → ingestr-0.9.3}/ingestr/src/filesystem/readers.py +0 -0
  79. {ingestr-0.9.1 → ingestr-0.9.3}/ingestr/src/google_sheets/README.md +0 -0
  80. {ingestr-0.9.1 → ingestr-0.9.3}/ingestr/src/google_sheets/__init__.py +0 -0
  81. {ingestr-0.9.1 → ingestr-0.9.3}/ingestr/src/google_sheets/helpers/__init__.py +0 -0
  82. {ingestr-0.9.1 → ingestr-0.9.3}/ingestr/src/google_sheets/helpers/api_calls.py +0 -0
  83. {ingestr-0.9.1 → ingestr-0.9.3}/ingestr/src/google_sheets/helpers/data_processing.py +0 -0
  84. {ingestr-0.9.1 → ingestr-0.9.3}/ingestr/src/gorgias/__init__.py +0 -0
  85. {ingestr-0.9.1 → ingestr-0.9.3}/ingestr/src/gorgias/helpers.py +0 -0
  86. {ingestr-0.9.1 → ingestr-0.9.3}/ingestr/src/hubspot/__init__.py +0 -0
  87. {ingestr-0.9.1 → ingestr-0.9.3}/ingestr/src/hubspot/helpers.py +0 -0
  88. {ingestr-0.9.1 → ingestr-0.9.3}/ingestr/src/hubspot/settings.py +0 -0
  89. {ingestr-0.9.1 → ingestr-0.9.3}/ingestr/src/kafka/__init__.py +0 -0
  90. {ingestr-0.9.1 → ingestr-0.9.3}/ingestr/src/kafka/helpers.py +0 -0
  91. {ingestr-0.9.1 → ingestr-0.9.3}/ingestr/src/klaviyo/_init_.py +0 -0
  92. {ingestr-0.9.1 → ingestr-0.9.3}/ingestr/src/klaviyo/client.py +0 -0
  93. {ingestr-0.9.1 → ingestr-0.9.3}/ingestr/src/klaviyo/helpers.py +0 -0
  94. {ingestr-0.9.1 → ingestr-0.9.3}/ingestr/src/notion/__init__.py +0 -0
  95. {ingestr-0.9.1 → ingestr-0.9.3}/ingestr/src/notion/helpers/__init__.py +0 -0
  96. {ingestr-0.9.1 → ingestr-0.9.3}/ingestr/src/notion/helpers/client.py +0 -0
  97. {ingestr-0.9.1 → ingestr-0.9.3}/ingestr/src/notion/helpers/database.py +0 -0
  98. {ingestr-0.9.1 → ingestr-0.9.3}/ingestr/src/notion/settings.py +0 -0
  99. {ingestr-0.9.1 → ingestr-0.9.3}/ingestr/src/shopify/__init__.py +0 -0
  100. {ingestr-0.9.1 → ingestr-0.9.3}/ingestr/src/shopify/exceptions.py +0 -0
  101. {ingestr-0.9.1 → ingestr-0.9.3}/ingestr/src/shopify/helpers.py +0 -0
  102. {ingestr-0.9.1 → ingestr-0.9.3}/ingestr/src/shopify/settings.py +0 -0
  103. {ingestr-0.9.1 → ingestr-0.9.3}/ingestr/src/slack/__init__.py +0 -0
  104. {ingestr-0.9.1 → ingestr-0.9.3}/ingestr/src/slack/helpers.py +0 -0
  105. {ingestr-0.9.1 → ingestr-0.9.3}/ingestr/src/slack/settings.py +0 -0
  106. {ingestr-0.9.1 → ingestr-0.9.3}/ingestr/src/sql_database/__init__.py +0 -0
  107. {ingestr-0.9.1 → ingestr-0.9.3}/ingestr/src/sql_database/arrow_helpers.py +0 -0
  108. {ingestr-0.9.1 → ingestr-0.9.3}/ingestr/src/sql_database/helpers.py +0 -0
  109. {ingestr-0.9.1 → ingestr-0.9.3}/ingestr/src/sql_database/override.py +0 -0
  110. {ingestr-0.9.1 → ingestr-0.9.3}/ingestr/src/sql_database/schema_types.py +0 -0
  111. {ingestr-0.9.1 → ingestr-0.9.3}/ingestr/src/stripe_analytics/__init__.py +0 -0
  112. {ingestr-0.9.1 → ingestr-0.9.3}/ingestr/src/stripe_analytics/helpers.py +0 -0
  113. {ingestr-0.9.1 → ingestr-0.9.3}/ingestr/src/stripe_analytics/settings.py +0 -0
  114. {ingestr-0.9.1 → ingestr-0.9.3}/ingestr/src/table_definition.py +0 -0
  115. {ingestr-0.9.1 → ingestr-0.9.3}/ingestr/src/telemetry/event.py +0 -0
  116. {ingestr-0.9.1 → ingestr-0.9.3}/ingestr/src/testdata/fakebqcredentials.json +0 -0
  117. {ingestr-0.9.1 → ingestr-0.9.3}/ingestr/src/zendesk/__init__.py +0 -0
  118. {ingestr-0.9.1 → ingestr-0.9.3}/ingestr/src/zendesk/helpers/__init__.py +0 -0
  119. {ingestr-0.9.1 → ingestr-0.9.3}/ingestr/src/zendesk/helpers/api_helpers.py +0 -0
  120. {ingestr-0.9.1 → ingestr-0.9.3}/ingestr/src/zendesk/helpers/credentials.py +0 -0
  121. {ingestr-0.9.1 → ingestr-0.9.3}/ingestr/src/zendesk/helpers/talk_api.py +0 -0
  122. {ingestr-0.9.1 → ingestr-0.9.3}/ingestr/src/zendesk/settings.py +0 -0
  123. {ingestr-0.9.1 → ingestr-0.9.3}/ingestr/testdata/.gitignore +0 -0
  124. {ingestr-0.9.1 → ingestr-0.9.3}/ingestr/testdata/create_replace.csv +0 -0
  125. {ingestr-0.9.1 → ingestr-0.9.3}/ingestr/testdata/delete_insert_expected.csv +0 -0
  126. {ingestr-0.9.1 → ingestr-0.9.3}/ingestr/testdata/delete_insert_part1.csv +0 -0
  127. {ingestr-0.9.1 → ingestr-0.9.3}/ingestr/testdata/delete_insert_part2.csv +0 -0
  128. {ingestr-0.9.1 → ingestr-0.9.3}/ingestr/testdata/merge_expected.csv +0 -0
  129. {ingestr-0.9.1 → ingestr-0.9.3}/ingestr/testdata/merge_part1.csv +0 -0
  130. {ingestr-0.9.1 → ingestr-0.9.3}/ingestr/testdata/merge_part2.csv +0 -0
  131. {ingestr-0.9.1 → ingestr-0.9.3}/package-lock.json +0 -0
  132. {ingestr-0.9.1 → ingestr-0.9.3}/package.json +0 -0
  133. {ingestr-0.9.1 → ingestr-0.9.3}/requirements-dev.txt +0 -0
  134. {ingestr-0.9.1 → ingestr-0.9.3}/resources/demo.gif +0 -0
  135. {ingestr-0.9.1 → ingestr-0.9.3}/resources/demo.tape +0 -0
  136. {ingestr-0.9.1 → ingestr-0.9.3}/resources/ingestr.svg +0 -0
@@ -15,3 +15,4 @@ docs/.vitepress/dist
15
15
  docs/.vitepress/cache
16
16
  node_modules
17
17
  *.duckdb
18
+ *.db
@@ -28,7 +28,7 @@ ENV VIRTUAL_ENV=/usr/local
28
28
  ADD --chmod=755 https://astral.sh/uv/install.sh /install.sh
29
29
  RUN /install.sh && rm /install.sh
30
30
 
31
- RUN /root/.cargo/bin/uv pip install --system --no-cache -r requirements.txt
31
+ RUN $HOME/.local/bin/uv pip install --system --no-cache -r requirements.txt
32
32
 
33
33
  COPY . /app
34
34
 
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: ingestr
3
- Version: 0.9.1
3
+ Version: 0.9.3
4
4
  Summary: ingestr is a command-line application that ingests data from various sources and stores them in any database.
5
5
  Project-URL: Homepage, https://github.com/bruin-data/ingestr
6
6
  Project-URL: Issues, https://github.com/bruin-data/ingestr/issues
@@ -28,7 +28,6 @@ Requires-Dist: py-machineid==0.5.1
28
28
  Requires-Dist: pyairtable==2.3.3
29
29
  Requires-Dist: pymongo==4.6.3
30
30
  Requires-Dist: pymysql==1.1.0
31
- Requires-Dist: pyodbc==5.1.0
32
31
  Requires-Dist: pyrate-limiter==3.6.1
33
32
  Requires-Dist: redshift-connector==2.1.0
34
33
  Requires-Dist: rich==13.7.1
@@ -44,6 +43,8 @@ Requires-Dist: stripe==10.7.0
44
43
  Requires-Dist: tqdm==4.66.2
45
44
  Requires-Dist: typer==0.12.3
46
45
  Requires-Dist: types-requests==2.32.0.20240907
46
+ Provides-Extra: odbc
47
+ Requires-Dist: pyodbc==5.1.0; extra == 'odbc'
47
48
  Provides-Extra: oracle
48
49
  Requires-Dist: cx-oracle==8.3.0; extra == 'oracle'
49
50
  Description-Content-Type: text/markdown
@@ -0,0 +1,84 @@
1
+ # Adjust
2
+
3
+ [Adjust](https://www.adjust.com/) is a mobile marketing analytics platform that provides solutions for measuring and optimizing campaigns, as well as protecting user data.
4
+
5
+ ingestr supports Adjust as a source.
6
+
7
+ ## URI Format
8
+
9
+ The URI format for Adjust is as follows:
10
+
11
+ ```plaintext
12
+ adjust://?api_key=<api-key-here>
13
+ ```
14
+ Parameters:
15
+ - `api_key`: Required. The API key for the Adjust account.
16
+ - `lookback_days`: Optional. The number of days to go back from the given start date when fetching data. Defaults to 30 days.
17
+
18
+ An API token is required to retrieve reports from the Adjust reporting API. Please follow the guide to [obtain an API key](https://dev.adjust.com/en/api/rs-api/authentication/).
19
+
20
+ Once you complete the guide, you should have an API key. Let's say your API key is `nr_123`, here's a sample command that will copy the data from Adjust into a DuckDB database:
21
+
22
+ ```sh
23
+ ingestr ingest --source-uri 'adjust://?api_key=nr_123' --source-table 'campaigns' --dest-uri duckdb:///adjust.duckdb --dest-table 'adjust.output'
24
+ ```
25
+
26
+ The result of this command will be a table in the `adjust.duckdb` database.
27
+
28
+ ### Lookback Days
29
+
30
+ Adjust data may change retroactively, which means you'll need to move your start date back to get the latest data. The `lookback_days` parameter allows you to specify how many days to go back when calculating the start date, and takes care of automatically updating the start date and getting the past data as well. It defaults to 30 days.
31
+
32
+ ## Tables
33
+ Adjust source allows ingesting data from various sources:
34
+
35
+ - `campaigns`: Retrieves data for a campaign, showing the app's revenue and network costs over multiple days.
36
+ - `creatives`: Retrieves data for a creative asset, detailing the app's revenue and network costs across multiple days.
37
+ - `custom`: Retrieves custom data based on the dimensions and metrics specified.
38
+
39
+ ### Custom Table: `custom:<dimensions>:<metrics>[:<filters>]`
40
+
41
+ The custom table allows you to retrieve data based on specific dimensions and metrics, and apply filters to the data.
42
+
43
+ The format for the custom table is:
44
+ ```plaintext
45
+ custom:<dimensions>:<metrics>[:<filters>]
46
+ ```
47
+
48
+ Parameters:
49
+ - `dimensions`: A comma-separated list of [dimensions](https://dev.adjust.com/en/api/rs-api/reports#dimensions) to retrieve.
50
+ - `metrics`: A comma-separated list of [metrics](https://dev.adjust.com/en/api/rs-api/reports#metrics) to retrieve.
51
+ - `filters`: A comma-separated list of [filters](https://dev.adjust.com/en/api/rs-api/reports#filters) to apply to the data.
52
+ - Parsing the `filters` key is smart enough to handle filters that contain commas inside them.
53
+
54
+ > [!WARNING]
55
+ > Custom tables require a time-based dimension for efficient operation, such as `hour`, `day`, `week`, `month`, or `year`.
56
+
57
+ ## Examples
58
+
59
+ Copy campaigns data from Adjust into a DuckDB database:
60
+ ```sh
61
+ ingestr ingest \
62
+ --source-uri 'adjust://?api_key=nr_123' \
63
+ --source-table 'campaigns' \
64
+ --dest-uri duckdb:///adjust.duckdb \
65
+ --dest-table 'adjust.output'
66
+ ```
67
+
68
+ Copy creatives data from Adjust into a DuckDB database:
69
+ ```sh
70
+ ingestr ingest \
71
+ --source-uri 'adjust://?api_key=nr_123' \
72
+ --source-table 'creatives' \
73
+ --dest-uri duckdb:///adjust.duckdb \
74
+ --dest-table 'adjust.output'
75
+ ```
76
+
77
+ Copy custom data from Adjust into a DuckDB database:
78
+ ```sh
79
+ ingestr ingest \
80
+ --source-uri "adjust://?api_key=nr_123&lookback_days=2" \
81
+ --source-table "custom:hour,app,store_id,channel,os_name,country_code,campaign_network,campaign_id_network,adgroup_network, adgroup_id_network,creative_network,creative_id_network:impressions,clicks,cost,network_cost,installs,ad_revenue,all_revenue" \
82
+ --dest-uri duckdb:///adjust.db \
83
+ --dest-table "mat.example"
84
+ ```
@@ -11,8 +11,8 @@ kafka://?bootstrap_servers=localhost:9092&group_id=test_group&security_protocol=
11
11
  ```
12
12
 
13
13
  URI parameters:
14
- - `bootstrap_servers`: The Kafka server(s) to connect to, typically in the form of a host and port (e.g., `localhost:9092`).
15
- - `group_id`: The consumer group ID used for identifying the client when consuming messages.
14
+ - `bootstrap_servers`(required): The Kafka server(s) to connect to, typically in the form of a host and port (e.g., `localhost:9092`).
15
+ - `group_id`(required): The consumer group ID used for identifying the client when consuming messages.
16
16
  - `security_protocol`: The protocol used to communicate with brokers (e.g., `SASL_SSL` for secure communication).
17
17
  - `sasl_mechanisms`: The SASL mechanism to be used for authentication (e.g., `PLAIN`).
18
18
  - `sasl_username`: The username for SASL authentication.
@@ -29,7 +29,7 @@ Once you have your Kafka server, credentials, and group ID set up, here's a samp
29
29
 
30
30
  ```sh
31
31
  ingestr ingest \
32
- --source-uri 'kafka://?bootstrap_servers=localhost:9092' \
32
+ --source-uri 'kafka://?bootstrap_servers=localhost:9092&group_id=test_group' \
33
33
  --source-table 'my-topic' \
34
34
  --dest-uri duckdb:///kafka.duckdb \
35
35
  --dest-table 'kafka.my_topic'
@@ -3,6 +3,14 @@ Microsoft SQL Server is a relational database management system developed by Mic
3
3
 
4
4
  ingestr supports Microsoft SQL Server as both a source and destination.
5
5
 
6
+ ## Installation
7
+
8
+ To use Microsoft SQL Server with ingestr, you need to install the `pyodbc` add-on as well. You can do this by running:
9
+
10
+ ```bash
11
+ pip install ingestr[odbc]
12
+ ```
13
+
6
14
  ## URI Format
7
15
  The URI format for Microsoft SQL Server is as follows:
8
16
 
@@ -323,10 +323,12 @@ def ingest(
323
323
  else "Platform-specific"
324
324
  )
325
325
 
326
+ source_table_print = source_table.split(":")[0]
327
+
326
328
  print()
327
329
  print("[bold green]Initiated the pipeline with the following:[/bold green]")
328
330
  print(
329
- f"[bold yellow] Source:[/bold yellow] {factory.source_scheme} / {source_table}"
331
+ f"[bold yellow] Source:[/bold yellow] {factory.source_scheme} / {source_table_print}"
330
332
  )
331
333
  print(
332
334
  f"[bold yellow] Destination:[/bold yellow] {factory.destination_scheme} / {dest_table}"
@@ -0,0 +1,100 @@
1
+ from typing import Optional, Sequence
2
+
3
+ import dlt
4
+ import pendulum
5
+ from dlt.sources import DltResource
6
+
7
+ from .adjust_helpers import DEFAULT_DIMENSIONS, DEFAULT_METRICS, AdjustAPI
8
+
9
+ REQUIRED_CUSTOM_DIMENSIONS = [
10
+ "hour",
11
+ "day",
12
+ "week",
13
+ "month",
14
+ "quarter",
15
+ "year",
16
+ ]
17
+ KNOWN_TYPE_HINTS = {
18
+ "hour": {"data_type": "timestamp"},
19
+ "day": {"data_type": "date"},
20
+ "week": {"data_type": "text"},
21
+ "month": {"data_type": "text"},
22
+ "quarter": {"data_type": "text"},
23
+ "year": {"data_type": "text"},
24
+ "campaign": {"data_type": "text"},
25
+ "adgroup": {"data_type": "text"},
26
+ "creative": {"data_type": "text"},
27
+ # metrics
28
+ "installs": {"data_type": "bigint"},
29
+ "clicks": {"data_type": "bigint"},
30
+ "cost": {"data_type": "decimal"},
31
+ "network_cost": {"data_type": "decimal"},
32
+ "impressions": {"data_type": "bigint"},
33
+ "ad_revenue": {"data_type": "decimal"},
34
+ "all_revenue": {"data_type": "decimal"},
35
+ }
36
+
37
+
38
+ @dlt.source(max_table_nesting=0)
39
+ def adjust_source(
40
+ start_date: pendulum.DateTime,
41
+ end_date: pendulum.DateTime,
42
+ api_key: str,
43
+ dimensions: Optional[list[str]] = None,
44
+ metrics: Optional[list[str]] = None,
45
+ merge_key: Optional[str] = None,
46
+ filters: Optional[dict] = None,
47
+ ) -> Sequence[DltResource]:
48
+ @dlt.resource(write_disposition="merge", merge_key="day")
49
+ def campaigns():
50
+ adjust_api = AdjustAPI(api_key=api_key)
51
+ yield from adjust_api.fetch_report_data(
52
+ start_date=start_date,
53
+ end_date=end_date,
54
+ dimensions=DEFAULT_DIMENSIONS,
55
+ metrics=DEFAULT_METRICS,
56
+ filters=filters,
57
+ )
58
+
59
+ @dlt.resource(write_disposition="merge", merge_key="day")
60
+ def creatives():
61
+ adjust_api = AdjustAPI(api_key=api_key)
62
+ yield from adjust_api.fetch_report_data(
63
+ start_date=start_date,
64
+ end_date=end_date,
65
+ dimensions=DEFAULT_DIMENSIONS + ["adgroup", "creative"],
66
+ metrics=DEFAULT_METRICS,
67
+ filters=filters,
68
+ )
69
+
70
+ merge_key = merge_key
71
+ for dimension in REQUIRED_CUSTOM_DIMENSIONS:
72
+ if dimension in dimensions:
73
+ merge_key = dimension
74
+ break
75
+
76
+ type_hints = {}
77
+ for dimension in dimensions:
78
+ if dimension in KNOWN_TYPE_HINTS:
79
+ type_hints[dimension] = KNOWN_TYPE_HINTS[dimension]
80
+ for metric in metrics:
81
+ if metric in KNOWN_TYPE_HINTS:
82
+ type_hints[metric] = KNOWN_TYPE_HINTS[metric]
83
+
84
+ @dlt.resource(
85
+ write_disposition={"disposition": "merge", "strategy": "delete+insert"},
86
+ merge_key=merge_key,
87
+ primary_key=dimensions,
88
+ columns=type_hints,
89
+ )
90
+ def custom():
91
+ adjust_api = AdjustAPI(api_key=api_key)
92
+ yield from adjust_api.fetch_report_data(
93
+ start_date=start_date,
94
+ end_date=end_date,
95
+ dimensions=dimensions,
96
+ metrics=metrics,
97
+ filters=filters,
98
+ )
99
+
100
+ return campaigns, creatives, custom
@@ -1,3 +1,6 @@
1
+ from typing import Optional
2
+
3
+ import pendulum
1
4
  import requests
2
5
  from dlt.sources.helpers.requests import Client
3
6
  from requests.exceptions import HTTPError
@@ -32,33 +35,32 @@ class AdjustAPI:
32
35
 
33
36
  def fetch_report_data(
34
37
  self,
35
- start_date,
36
- end_date,
38
+ start_date: pendulum.DateTime,
39
+ end_date: pendulum.DateTime,
37
40
  dimensions=DEFAULT_DIMENSIONS,
38
41
  metrics=DEFAULT_METRICS,
39
- utc_offset="+00:00",
40
- ad_spend_mode="network",
41
- attribution_source="first",
42
- attribution_type="all",
43
- cohort_maturity="immature",
44
- reattributed="all",
45
- sandbox="false",
42
+ filters: Optional[dict] = None,
46
43
  ):
47
44
  headers = {"Authorization": f"Bearer {self.api_key}"}
48
- comma_separated_dimensions = ",".join(dimensions)
49
- comma_separated_metrics = ",".join(metrics)
50
- params = {
51
- "date_period": f"{start_date}:{end_date}",
52
- "dimensions": comma_separated_dimensions,
53
- "metrics": comma_separated_metrics,
54
- "utc_offset": utc_offset,
55
- "ad_spend_mode": ad_spend_mode,
56
- "attribution_source": attribution_source,
57
- "attribution_type": attribution_type,
58
- "cohort_maturity": cohort_maturity,
59
- "reattributed": reattributed,
60
- "sandbox": sandbox,
61
- }
45
+ params = {}
46
+
47
+ if filters:
48
+ for key, value in filters.items():
49
+ if isinstance(value, list):
50
+ params[key] = ",".join(value)
51
+ else:
52
+ params[key] = value
53
+
54
+ params["date_period"] = (
55
+ f"{start_date.format('YYYY-MM-DD')}:{end_date.format('YYYY-MM-DD')}"
56
+ )
57
+ params["dimensions"] = ",".join(dimensions)
58
+ params["metrics"] = ",".join(metrics)
59
+
60
+ if start_date > end_date:
61
+ raise ValueError(
62
+ f"Invalid date range: Start date ({start_date}) must be earlier than end date ({end_date})."
63
+ )
62
64
 
63
65
  def retry_on_limit(
64
66
  response: requests.Response, exception: BaseException
@@ -80,3 +82,24 @@ class AdjustAPI:
80
82
  yield items
81
83
  else:
82
84
  raise HTTPError(f"Request failed with status code: {response.status_code}")
85
+
86
+
87
+ def parse_filters(filters_raw: str) -> dict:
88
+ # Parse filter string like "key1=value1,key2=value2,value3,value4"
89
+ filters = {}
90
+ current_key = None
91
+
92
+ for item in filters_raw.split(","):
93
+ if "=" in item:
94
+ # Start of a new key-value pair
95
+ key, value = item.split("=")
96
+ filters[key] = [value] # Always start with a list
97
+ current_key = key
98
+ elif current_key is not None:
99
+ # Additional value for the current key
100
+ filters[current_key].append(item)
101
+
102
+ # Convert single-item lists to simple values
103
+ filters = {k: v[0] if len(v) == 1 else v for k, v in filters.items()}
104
+
105
+ return filters
@@ -0,0 +1,77 @@
1
+ """Source that loads tables form Airtable.
2
+ Supports whitelisting of tables or loading of all tables from a specified base.
3
+ """
4
+
5
+ from typing import Any, Optional
6
+
7
+ import dlt
8
+ from dlt.common.schema.typing import TColumnNames, TTableSchemaColumns
9
+ from dlt.extract.items import TTableHintTemplate
10
+
11
+
12
+ def memory_mapped_arrow(
13
+ path: str,
14
+ columns: Optional[TTableSchemaColumns] = None,
15
+ primary_key: Optional[TTableHintTemplate[TColumnNames]] = None,
16
+ merge_key: Optional[TTableHintTemplate[TColumnNames]] = None,
17
+ incremental: Optional[dlt.sources.incremental[Any]] = None,
18
+ ):
19
+ @dlt.resource(
20
+ name="arrow_mmap",
21
+ columns=columns, # type: ignore
22
+ primary_key=primary_key, # type: ignore
23
+ merge_key=merge_key, # type: ignore
24
+ )
25
+ def arrow_mmap(
26
+ incremental: Optional[dlt.sources.incremental[Any]] = incremental,
27
+ ):
28
+ import pyarrow as pa # type: ignore
29
+ import pyarrow.ipc as ipc # type: ignore
30
+
31
+ with pa.memory_map(path, "rb") as mmap:
32
+ reader: ipc.RecordBatchFileReader = ipc.open_file(mmap)
33
+ table = reader.read_all()
34
+
35
+ last_value = None
36
+ end_value = None
37
+ if incremental:
38
+ if incremental.cursor_path not in table.column_names:
39
+ raise KeyError(
40
+ f"Cursor column '{incremental.cursor_path}' does not exist in table"
41
+ )
42
+
43
+ last_value = incremental.last_value
44
+ end_value = incremental.end_value
45
+
46
+ if last_value is not None:
47
+ # Check if the column is a date type
48
+ if pa.types.is_temporal(table.schema.field(incremental.cursor_path).type): # type: ignore
49
+ if not isinstance(last_value, pa.TimestampScalar):
50
+ last_value = pa.scalar(last_value, type=pa.timestamp("ns"))
51
+
52
+ table = table.filter(
53
+ pa.compute.field(incremental.cursor_path) > last_value # type: ignore
54
+ )
55
+ else:
56
+ # For non-date types, use direct comparison
57
+ table = table.filter(
58
+ pa.compute.field(incremental.cursor_path) > last_value # type: ignore
59
+ )
60
+
61
+ if end_value is not None:
62
+ if pa.types.is_timestamp(table.schema.field(incremental.cursor_path).type): # type: ignore
63
+ # Convert end_value to timestamp if it's not already
64
+ if not isinstance(end_value, pa.TimestampScalar):
65
+ end_value = pa.scalar(end_value, type=pa.timestamp("ns"))
66
+ table = table.filter(
67
+ pa.compute.field(incremental.cursor_path) < end_value # type: ignore
68
+ )
69
+ else:
70
+ # For non-date types, use direct comparison
71
+ table = table.filter(
72
+ pa.compute.field(incremental.cursor_path) < end_value # type: ignore
73
+ )
74
+
75
+ yield table
76
+
77
+ return arrow_mmap
@@ -18,6 +18,7 @@ from ingestr.src.sources import (
18
18
  AdjustSource,
19
19
  AirtableSource,
20
20
  AppsflyerSource,
21
+ ArrowMemoryMappedSource,
21
22
  ChessSource,
22
23
  FacebookAdsSource,
23
24
  GoogleSheetsSource,
@@ -136,6 +137,8 @@ class SourceDestinationFactory:
136
137
  return AdjustSource()
137
138
  elif self.source_scheme == "zendesk":
138
139
  return ZendeskSource()
140
+ elif self.source_scheme == "mmap":
141
+ return ArrowMemoryMappedSource()
139
142
  elif self.source_scheme == "s3":
140
143
  return S3Source()
141
144
  else:
@@ -65,7 +65,7 @@ def mongodb(
65
65
  sections=("sources", "mongodb"), spec=MongoDbCollectionResourceConfiguration
66
66
  )
67
67
  def mongodb_collection(
68
- connection_url: str = dlt.secrets.value,
68
+ connection_url: str = dlt.config.value,
69
69
  database: Optional[str] = dlt.config.value,
70
70
  collection: str = dlt.config.value,
71
71
  incremental: Optional[dlt.sources.incremental] = None, # type: ignore[type-arg]
@@ -155,7 +155,7 @@ class MongoDbCollectionConfiguration(BaseConfiguration):
155
155
 
156
156
  @configspec
157
157
  class MongoDbCollectionResourceConfiguration(BaseConfiguration):
158
- connection_url: str = dlt.secrets.value
158
+ connection_url: str = dlt.config.value
159
159
  database: Optional[str] = dlt.config.value
160
160
  collection: str = dlt.config.value
161
161
  incremental: Optional[dlt.sources.incremental] = None # type: ignore[type-arg]
@@ -1,17 +1,21 @@
1
1
  import base64
2
2
  import csv
3
3
  import json
4
- from datetime import date, datetime
4
+ from datetime import date
5
5
  from typing import Any, Callable, Optional
6
6
  from urllib.parse import parse_qs, urlparse
7
7
 
8
8
  import dlt
9
+ import pendulum
9
10
  from dlt.common.configuration.specs import AwsCredentials
11
+ from dlt.common.time import ensure_pendulum_datetime
10
12
  from dlt.common.typing import TSecretStrValue
11
13
 
12
- from ingestr.src.adjust._init_ import adjust_source
14
+ from ingestr.src.adjust import REQUIRED_CUSTOM_DIMENSIONS, adjust_source
15
+ from ingestr.src.adjust.adjust_helpers import parse_filters
13
16
  from ingestr.src.airtable import airtable_source
14
17
  from ingestr.src.appsflyer._init_ import appsflyer_source
18
+ from ingestr.src.arrow import memory_mapped_arrow
15
19
  from ingestr.src.chess import source
16
20
  from ingestr.src.facebook_ads import facebook_ads_source, facebook_insights_source
17
21
  from ingestr.src.filesystem import readers
@@ -75,6 +79,51 @@ class SqlSource:
75
79
  return table_instance
76
80
 
77
81
 
82
+ class ArrowMemoryMappedSource:
83
+ table_builder: Callable
84
+
85
+ def __init__(self, table_builder=memory_mapped_arrow) -> None:
86
+ self.table_builder = table_builder
87
+
88
+ def handles_incrementality(self) -> bool:
89
+ return False
90
+
91
+ def dlt_source(self, uri: str, table: str, **kwargs):
92
+ import os
93
+
94
+ incremental = None
95
+ if kwargs.get("incremental_key"):
96
+ start_value = kwargs.get("interval_start")
97
+ end_value = kwargs.get("interval_end")
98
+
99
+ incremental = dlt.sources.incremental(
100
+ kwargs.get("incremental_key", ""),
101
+ initial_value=start_value,
102
+ end_value=end_value,
103
+ )
104
+
105
+ file_path = uri.split("://")[1]
106
+ if not os.path.exists(file_path):
107
+ raise ValueError(f"File at path {file_path} does not exist")
108
+
109
+ if os.path.isdir(file_path):
110
+ raise ValueError(
111
+ f"Path {file_path} is a directory, it should be an Arrow memory mapped file"
112
+ )
113
+
114
+ primary_key = kwargs.get("primary_key")
115
+ merge_key = kwargs.get("merge_key")
116
+
117
+ table_instance = self.table_builder(
118
+ path=file_path,
119
+ incremental=incremental,
120
+ merge_key=merge_key,
121
+ primary_key=primary_key,
122
+ )
123
+
124
+ return table_instance
125
+
126
+
78
127
  class MongoDbSource:
79
128
  table_builder: Callable
80
129
 
@@ -656,12 +705,12 @@ class KafkaSource:
656
705
  credentials=KafkaCredentials(
657
706
  bootstrap_servers=bootstrap_servers[0],
658
707
  group_id=group_id[0],
659
- security_protocol=security_protocol[0]
660
- if len(security_protocol) > 0
661
- else None, # type: ignore
662
- sasl_mechanisms=sasl_mechanisms[0]
663
- if len(sasl_mechanisms) > 0
664
- else None, # type: ignore
708
+ security_protocol=(
709
+ security_protocol[0] if len(security_protocol) > 0 else None
710
+ ), # type: ignore
711
+ sasl_mechanisms=(
712
+ sasl_mechanisms[0] if len(sasl_mechanisms) > 0 else None
713
+ ), # type: ignore
665
714
  sasl_username=sasl_username[0] if len(sasl_username) > 0 else None, # type: ignore
666
715
  sasl_password=sasl_password[0] if len(sasl_password) > 0 else None, # type: ignore
667
716
  ),
@@ -673,10 +722,10 @@ class KafkaSource:
673
722
 
674
723
  class AdjustSource:
675
724
  def handles_incrementality(self) -> bool:
676
- return True
725
+ return False
677
726
 
678
727
  def dlt_source(self, uri: str, table: str, **kwargs):
679
- if kwargs.get("incremental_key"):
728
+ if kwargs.get("incremental_key") and not table.startswith("custom:"):
680
729
  raise ValueError(
681
730
  "Adjust takes care of incrementality on its own, you should not provide incremental_key"
682
731
  )
@@ -688,25 +737,62 @@ class AdjustSource:
688
737
  if not api_key:
689
738
  raise ValueError("api_key in the URI is required to connect to Adjust")
690
739
 
691
- interval_start = kwargs.get("interval_start")
692
- interval_end = kwargs.get("interval_end")
740
+ lookback_days = int(source_params.get("lookback_days", [30])[0])
693
741
 
694
742
  start_date = (
695
- interval_start.strftime("%Y-%m-%d") if interval_start else "2000-01-01"
696
- )
697
- end_date = (
698
- interval_end.strftime("%Y-%m-%d")
699
- if interval_end
700
- else datetime.now().strftime("%Y-%m-%d")
743
+ pendulum.now()
744
+ .replace(hour=0, minute=0, second=0, microsecond=0)
745
+ .subtract(days=lookback_days)
701
746
  )
747
+ if kwargs.get("interval_start"):
748
+ start_date = (
749
+ ensure_pendulum_datetime(str(kwargs.get("interval_start")))
750
+ .replace(hour=0, minute=0, second=0, microsecond=0)
751
+ .subtract(days=lookback_days)
752
+ )
702
753
 
703
- Endpoint = None
704
- if table in ["campaigns", "creatives"]:
705
- Endpoint = table
754
+ end_date = pendulum.now()
755
+ if kwargs.get("interval_end"):
756
+ end_date = ensure_pendulum_datetime(str(kwargs.get("interval_end")))
757
+
758
+ dimensions = None
759
+ metrics = None
760
+ filters = []
761
+ if table.startswith("custom:"):
762
+ fields = table.split(":")
763
+ if len(fields) != 3 and len(fields) != 4:
764
+ raise ValueError(
765
+ "Invalid Adjust custom table format. Expected format: custom:<dimensions>,<metrics> or custom:<dimensions>:<metrics>:<filters>"
766
+ )
767
+
768
+ dimensions = fields[1].split(",")
769
+ metrics = fields[2].split(",")
770
+ table = "custom"
771
+
772
+ found = False
773
+ for dimension in dimensions:
774
+ if dimension in REQUIRED_CUSTOM_DIMENSIONS:
775
+ found = True
776
+ break
777
+
778
+ if not found:
779
+ raise ValueError(
780
+ f"At least one of the required dimensions is missing for custom Adjust report: {REQUIRED_CUSTOM_DIMENSIONS}"
781
+ )
782
+
783
+ if len(fields) == 4:
784
+ filters_raw = fields[3]
785
+ filters = parse_filters(filters_raw)
706
786
 
707
787
  return adjust_source(
708
- start_date=start_date, end_date=end_date, api_key=api_key[0]
709
- ).with_resources(Endpoint)
788
+ start_date=start_date,
789
+ end_date=end_date,
790
+ api_key=api_key[0],
791
+ dimensions=dimensions,
792
+ metrics=metrics,
793
+ merge_key=kwargs.get("merge_key"),
794
+ filters=filters,
795
+ ).with_resources(table)
710
796
 
711
797
 
712
798
  class AppsflyerSource:
@@ -0,0 +1 @@
1
+ __version__ = "0.9.3"
@@ -132,6 +132,9 @@ classifiers = [
132
132
  oracle = [
133
133
  "cx_Oracle==8.3.0",
134
134
  ]
135
+ odbc = [
136
+ "pyodbc==5.1.0",
137
+ ]
135
138
 
136
139
  [project.urls]
137
140
  Homepage = "https://github.com/bruin-data/ingestr"
@@ -11,7 +11,6 @@ psycopg2-binary==2.9.9
11
11
  py-machineid==0.5.1
12
12
  pymongo==4.6.3
13
13
  pymysql==1.1.0
14
- pyodbc==5.1.0
15
14
  pyrate-limiter==3.6.1
16
15
  redshift-connector==2.1.0
17
16
  rich==13.7.1
@@ -1,30 +0,0 @@
1
- # Adjust
2
-
3
- [Adjust](https://www.adjust.com/) is a mobile marketing analytics platform that provides solutions for measuring and optimizing campaigns, as well as protecting user data.
4
-
5
- ingestr supports Adjust as a source.
6
-
7
- ## URI Format
8
-
9
- The URI format for Adjust is as follows:
10
-
11
- ```plaintext
12
- adjust://?api_key=<api-key-here>
13
- ```
14
-
15
- An API token is required to retrieve reports from the Adjust reporting API. please follow the guide to [obtain a API key](https://dev.adjust.com/en/api/rs-api/authentication/).
16
-
17
- Once you complete the guide, you should have an API key. Let's say your API key is `nr_123`, here's a sample command that will copy the data from Adjust into a duckdb database:
18
-
19
- ```sh
20
- ingestr ingest --source-uri 'adjust://?api_key=nr_123' --source-table 'campaigns' --dest-uri duckdb:///adjust.duckdb --dest-table 'adjust.output' --interval-start '2024-09-05' --interval-end '2024-09-08'
21
- ```
22
-
23
- The result of this command will be a table in the `adjust.duckdb` database
24
-
25
- Available Source Table:
26
- Adjust source allows ingesting the following source into separate tables:
27
-
28
- -`Campaigns`: Retrieves data for a campaign, showing the app's revenue and network costs over multiple days.
29
-
30
- --`Creatives`: Retrieves data for a creative assest, detailing the app's revenue and network costs across multiple days
@@ -1,31 +0,0 @@
1
- from typing import Sequence
2
-
3
- import dlt
4
- from dlt.sources import DltResource
5
-
6
- from .helpers import DEFAULT_DIMENSIONS, AdjustAPI
7
-
8
-
9
- @dlt.source(max_table_nesting=0)
10
- def adjust_source(
11
- start_date: str,
12
- end_date: str,
13
- api_key: str,
14
- ) -> Sequence[DltResource]:
15
- @dlt.resource(write_disposition="merge", merge_key="day")
16
- def campaigns():
17
- adjust_api = AdjustAPI(api_key=api_key)
18
- yield from adjust_api.fetch_report_data(
19
- start_date=start_date,
20
- end_date=end_date,
21
- )
22
-
23
- @dlt.resource(write_disposition="merge", merge_key="day")
24
- def creatives():
25
- dimensions = DEFAULT_DIMENSIONS + ["adgroup", "creative"]
26
- adjust_api = AdjustAPI(api_key=api_key)
27
- yield from adjust_api.fetch_report_data(
28
- start_date=start_date, end_date=end_date, dimensions=dimensions
29
- )
30
-
31
- return campaigns, creatives
@@ -1 +0,0 @@
1
- __version__ = "0.9.1"
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes