ingestify 0.0.1__tar.gz → 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (86) hide show
  1. ingestify-0.1.0/PKG-INFO +254 -0
  2. ingestify-0.1.0/README.md +244 -0
  3. ingestify-0.1.0/ingestify/__init__.py +11 -0
  4. ingestify-0.1.0/ingestify/application/__init__.py +0 -0
  5. ingestify-0.1.0/ingestify/application/dataset_store.py +339 -0
  6. ingestify-0.1.0/ingestify/application/ingestion_engine.py +62 -0
  7. ingestify-0.1.0/ingestify/application/loader.py +329 -0
  8. ingestify-0.1.0/ingestify/application/secrets_manager.py +53 -0
  9. ingestify-0.1.0/ingestify/cmdline.py +283 -0
  10. ingestify-0.1.0/ingestify/domain/__init__.py +2 -0
  11. ingestify-0.1.0/ingestify/domain/models/__init__.py +45 -0
  12. ingestify-0.1.0/ingestify/domain/models/data_spec_version_collection.py +33 -0
  13. ingestify-0.1.0/ingestify/domain/models/dataset/__init__.py +27 -0
  14. ingestify-0.1.0/ingestify/domain/models/dataset/collection.py +44 -0
  15. ingestify-0.1.0/ingestify/domain/models/dataset/collection_metadata.py +13 -0
  16. ingestify-0.1.0/ingestify/domain/models/dataset/dataset.py +104 -0
  17. ingestify-0.1.0/ingestify/domain/models/dataset/dataset_repository.py +46 -0
  18. ingestify-0.1.0/ingestify/domain/models/dataset/events.py +31 -0
  19. ingestify-0.1.0/ingestify/domain/models/dataset/file.py +146 -0
  20. ingestify-0.1.0/ingestify/domain/models/dataset/file_collection.py +35 -0
  21. ingestify-0.1.0/ingestify/domain/models/dataset/file_repository.py +59 -0
  22. ingestify-0.1.0/ingestify/domain/models/dataset/identifier.py +24 -0
  23. ingestify-0.1.0/ingestify/domain/models/dataset/revision.py +29 -0
  24. ingestify-0.1.0/ingestify/domain/models/dataset/selector.py +37 -0
  25. ingestify-0.1.0/ingestify/domain/models/event/__init__.py +4 -0
  26. ingestify-0.1.0/ingestify/domain/models/event/_old_event.py +21 -0
  27. ingestify-0.1.0/ingestify/domain/models/event/dispatcher.py +8 -0
  28. ingestify-0.1.0/ingestify/domain/models/event/domain_event.py +10 -0
  29. ingestify-0.1.0/ingestify/domain/models/event/event_bus.py +24 -0
  30. ingestify-0.1.0/ingestify/domain/models/event/publisher.py +23 -0
  31. ingestify-0.1.0/ingestify/domain/models/event/subscriber.py +39 -0
  32. ingestify-0.1.0/ingestify/domain/models/extract_job.py +23 -0
  33. ingestify-0.1.0/ingestify/domain/models/fetch_policy.py +40 -0
  34. ingestify-0.1.0/ingestify/domain/models/resources/__init__.py +1 -0
  35. ingestify-0.1.0/ingestify/domain/models/resources/dataset_resource.py +99 -0
  36. ingestify-0.1.0/ingestify/domain/models/sink.py +16 -0
  37. ingestify-0.1.0/ingestify/domain/models/source.py +34 -0
  38. ingestify-0.1.0/ingestify/domain/models/task/__init__.py +4 -0
  39. ingestify-0.1.0/ingestify/domain/models/task/set.py +21 -0
  40. ingestify-0.1.0/ingestify/domain/models/task/task.py +7 -0
  41. ingestify-0.1.0/ingestify/domain/services/__init__.py +0 -0
  42. ingestify-0.1.0/ingestify/domain/services/transformers/__init__.py +0 -0
  43. ingestify-0.1.0/ingestify/domain/services/transformers/kloppy_to_pandas.py +25 -0
  44. ingestify-0.1.0/ingestify/exceptions.py +10 -0
  45. ingestify-0.1.0/ingestify/infra/__init__.py +4 -0
  46. ingestify-0.1.0/ingestify/infra/fetch/__init__.py +0 -0
  47. ingestify-0.1.0/ingestify/infra/fetch/http.py +100 -0
  48. ingestify-0.1.0/ingestify/infra/serialization/__init__.py +50 -0
  49. ingestify-0.1.0/ingestify/infra/sink/__init__.py +0 -0
  50. ingestify-0.1.0/ingestify/infra/sink/postgresql.py +50 -0
  51. ingestify-0.1.0/ingestify/infra/source/__init__.py +0 -0
  52. ingestify-0.1.0/ingestify/infra/source/statsbomb_github.py +92 -0
  53. ingestify-0.1.0/ingestify/infra/source/wyscout.py +175 -0
  54. ingestify-0.1.0/ingestify/infra/store/__init__.py +2 -0
  55. ingestify-0.1.0/ingestify/infra/store/dataset/__init__.py +2 -0
  56. ingestify-0.1.0/ingestify/infra/store/dataset/local_dataset_repository.py +73 -0
  57. ingestify-0.1.0/ingestify/infra/store/dataset/sqlalchemy/__init__.py +1 -0
  58. ingestify-0.1.0/ingestify/infra/store/dataset/sqlalchemy/mapping.py +153 -0
  59. ingestify-0.1.0/ingestify/infra/store/dataset/sqlalchemy/repository.py +239 -0
  60. ingestify-0.1.0/ingestify/infra/store/file/__init__.py +2 -0
  61. ingestify-0.1.0/ingestify/infra/store/file/local_file_repository.py +32 -0
  62. ingestify-0.1.0/ingestify/infra/store/file/s3_file_repository.py +50 -0
  63. ingestify-0.1.0/ingestify/main.py +205 -0
  64. ingestify-0.1.0/ingestify/server.py +78 -0
  65. ingestify-0.1.0/ingestify/source_base.py +23 -0
  66. ingestify-0.1.0/ingestify/static/templates/statsbomb_github/README.md +0 -0
  67. ingestify-0.1.0/ingestify/static/templates/statsbomb_github/config.yaml.jinja2 +19 -0
  68. ingestify-0.1.0/ingestify/static/templates/statsbomb_github/database/README.md +1 -0
  69. ingestify-0.1.0/ingestify/static/templates/statsbomb_github/query.py +14 -0
  70. ingestify-0.1.0/ingestify/static/templates/wyscout/.env +5 -0
  71. ingestify-0.1.0/ingestify/static/templates/wyscout/.gitignore +2 -0
  72. ingestify-0.1.0/ingestify/static/templates/wyscout/README.md +0 -0
  73. ingestify-0.1.0/ingestify/static/templates/wyscout/config.yaml.jinja2 +18 -0
  74. ingestify-0.1.0/ingestify/static/templates/wyscout/database/README.md +1 -0
  75. ingestify-0.1.0/ingestify/static/templates/wyscout/query.py +14 -0
  76. ingestify-0.1.0/ingestify/utils.py +276 -0
  77. ingestify-0.1.0/ingestify.egg-info/PKG-INFO +254 -0
  78. ingestify-0.1.0/ingestify.egg-info/SOURCES.txt +82 -0
  79. ingestify-0.1.0/ingestify.egg-info/dependency_links.txt +1 -0
  80. ingestify-0.1.0/ingestify.egg-info/entry_points.txt +2 -0
  81. ingestify-0.1.0/ingestify.egg-info/requires.txt +13 -0
  82. ingestify-0.1.0/ingestify.egg-info/top_level.txt +1 -0
  83. ingestify-0.1.0/setup.cfg +4 -0
  84. ingestify-0.1.0/setup.py +54 -0
  85. ingestify-0.0.1/PKG-INFO +0 -10
  86. ingestify-0.0.1/setup.py +0 -15
@@ -0,0 +1,254 @@
1
+ Metadata-Version: 2.1
2
+ Name: ingestify
3
+ Version: 0.1.0
4
+ Summary: Standardizing soccer tracking- and event data
5
+ Author: Koen Vossen
6
+ Author-email: info@koenvossen.nl
7
+ License: AGPL
8
+ Description-Content-Type: text/markdown
9
+ Provides-Extra: test
10
+
11
+ # Ingestify
12
+
13
+ ## Data Management Platform
14
+
15
+ In general a data management platform contains:
16
+ 1. Ingestion of data (Extract from Source and Load into Data Lake)
17
+ 2. Transformation of data (Extract from Data Lake, Transform and Load into Data Warehouse)
18
+ 3. Utilization of data
19
+
20
+ <img src="https://www.getdbt.com/ui/img/blog/what-exactly-is-dbt/1-BogoeTTK1OXFU1hPfUyCFw.png" />
21
+ Source: https://www.getdbt.com/blog/what-exactly-is-dbt/
22
+
23
+ TODO: Improve drawings and explain more
24
+
25
+ ## Ingestify
26
+
27
+ Ingestify focuses on the ingestion of data.
28
+
29
+ ### How does Ingestify work?
30
+
31
+ 1. A `Source` is asked for all available `Datasets` using the `discover_datasets` method
32
+ 2. All available `Datasets` are compared with what's already fetched to determine which ones are missing or have changed (using a `FetchPolicy`)
33
+ 3. A `TaskQueue` is filled with `Tasks` to fetch all missing or stale `Datasets`
34
+
35
+ <img src="https://raw.githubusercontent.com/PySport/ingestify/refs/heads/main/docs/overview.svg" />
36
+
37
+ - [Source](blob/main/ingestify/domain/models/source.py) is the main entrance from Ingestify to external sources. A Source must always define:
38
+ - `discover_datasets` - Creates a list of all available datasets on the Source
39
+ - `fetch_dataset_files` - Fetches a single dataset for a Source
40
+ - [Dataset Store](blob/main/ingestify/application/dataset_store.py) manages the access to the Metadata storage and the file storage. It keeps track of versions, and knows how to load data.
41
+ - [Loader](blob/main/ingestify/application/loader.py) organizes the fetching process. It does this by executing the following steps:
42
+ 1. Ask `Source` for all available datasets for a selector
43
+ 2. Ask `Dataset Store` for all available datasets for a selector
44
+ 3. Determine missing `Datasets`
45
+ 4. Create tasks for data retrieval and put them in the `TaskQueue`
46
+ 5. Use multiprocessing to execute all tasks
47
+
48
+ ## Get started
49
+
50
+ ### Install
51
+
52
+ Make sure you have installed the latest version:
53
+ ```bash
54
+ pip install git+https://github.com/PySport/ingestify.git
55
+
56
+ # OR
57
+
58
+ pip install git+ssh://git@github.com/PySport/ingestify.git
59
+ ```
60
+
61
+ ### Using a template
62
+
63
+ Ingestify provides some templates to get started quickly. When using `ingestify init` a new project will be created and example files are copied.
64
+ Currently, Ingestify offers a `statsbomb_github` and `wyscout` template.
65
+
66
+ #### Statsbomb Github
67
+
68
+ This uses https://github.com/statsbomb/open-data as source and syncs some competitions.
69
+
70
+ ```
71
+ bash# ingestify init --template statsbomb_github /tmp/ingestify-test
72
+
73
+ 2023-05-23 08:57:51,250 [INFO] ingestify.cmdline: Initialized project at `/tmp/ingestify-test` with template `statsbomb_github`
74
+ ```
75
+
76
+ #### Wyscout
77
+
78
+ This requires valid Wyscout credentials. The template includes some security best practices, like using a `.env` file for credentials which isn't part of version control.
79
+
80
+ ```
81
+ bash# ingestify init --template wyscout /tmp/ingestify-test
82
+
83
+ 2023-05-23 08:58:18,720 [INFO] ingestify.cmdline: Initialized project at `/tmp/ingestify-test` with template `wyscout`
84
+ ```
85
+
86
+ ### Running Ingestify
87
+
88
+ To actually run Ingestify you first change the current directory to the project directory.
89
+
90
+ Then run:
91
+ ```bash
92
+ bash# ingestify run
93
+
94
+ 2023-05-23 08:59:07,066 [INFO] ingestify.main: Initializing sources
95
+ 2023-05-23 08:59:07,068 [INFO] ingestify.main: Initializing IngestionEngine
96
+ 2023-05-23 08:59:07,086 [INFO] ingestify.main: Determining tasks...
97
+ 2023-05-23 08:59:07,364 [INFO] ingestify.application.loader: Discovered 33 datasets from StatsbombGithub using selector competition_id=11/season_id=42 => 33 tasks. 0 skipped.
98
+ 2023-05-23 08:59:07,625 [INFO] ingestify.application.loader: Discovered 35 datasets from StatsbombGithub using selector competition_id=11/season_id=90 => 35 tasks. 0 skipped.
99
+ 2023-05-23 08:59:07,625 [INFO] ingestify.application.loader: Scheduled 68 tasks. With 10 processes
100
+ 2023-05-23 08:59:07,654 [INFO] ingestify.application.loader: Running task CreateDatasetTask(StatsbombGithub -> competition_id=11/season_id=42/match_id=303516)
101
+ 2023-05-23 08:59:07,654 [INFO] ingestify.application.loader: Running task CreateDatasetTask(StatsbombGithub -> competition_id=11/season_id=42/match_id=303731)
102
+ 2023-05-23 08:59:07,655 [INFO] ingestify.application.loader: Running task CreateDatasetTask(StatsbombGithub -> competition_id=11/season_id=42/match_id=303430)
103
+ 2023-05-23 08:59:07,655 [INFO] ingestify.application.loader: Running task CreateDatasetTask(StatsbombGithub -> competition_id=11/season_id=42/match_id=303504)
104
+ 2023-05-23 08:59:07,655 [INFO] ingestify.application.loader: Running task CreateDatasetTask(StatsbombGithub -> competition_id=11/season_id=42/match_id=303421)
105
+ 2023-05-23 08:59:07,655 [INFO] ingestify.application.loader: Running task CreateDatasetTask(StatsbombGithub -> competition_id=11/season_id=42/match_id=303400)
106
+ 2023-05-23 08:59:07,656 [INFO] ingestify.application.loader: Running task CreateDatasetTask(StatsbombGithub -> competition_id=11/season_id=42/match_id=303664)
107
+ 2023-05-23 08:59:07,656 [INFO] ingestify.application.loader: Running task CreateDatasetTask(StatsbombGithub -> competition_id=11/season_id=42/match_id=303680)
108
+ 2023-05-23 08:59:07,657 [INFO] ingestify.application.loader: Running task CreateDatasetTask(StatsbombGithub -> competition_id=11/season_id=42/match_id=303487)
109
+ 2023-05-23 08:59:07,658 [INFO] ingestify.application.loader: Running task CreateDatasetTask(StatsbombGithub -> competition_id=11/season_id=42/match_id=303615)
110
+ 2023-05-23 08:59:08,419 [INFO] ingestify.application.loader: Running task CreateDatasetTask(StatsbombGithub -> competition_id=11/season_id=42/match_id=303532)
111
+ 2023-05-23 08:59:08,421 [INFO] ingestify.application.loader: Running task CreateDatasetTask(StatsbombGithub -> competition_id=11/season_id=42/match_id=303682)
112
+ 2023-05-23 08:59:08,444 [INFO] ingestify.application.loader: Running task CreateDatasetTask(StatsbombGithub -> competition_id=11/season_id=42/match_id=303451)
113
+ 2023-05-23 08:59:08,462 [INFO] ingestify.application.loader: Running task CreateDatasetTask(StatsbombGithub -> competition_id=11/season_id=42/match_id=303596)
114
+ 2023-05-23 08:59:08,518 [INFO] ingestify.application.loader: Running task CreateDatasetTask(StatsbombGithub -> competition_id=11/season_id=42/match_id=303634)
115
+ 2023-05-23 08:59:08,528 [INFO] ingestify.application.loader: Running task CreateDatasetTask(StatsbombGithub -> competition_id=11/season_id=42/match_id=303479)
116
+ 2023-05-23 08:59:08,541 [INFO] ingestify.application.loader: Running task CreateDatasetTask(StatsbombGithub -> competition_id=11/season_id=42/match_id=303696)
117
+ 2023-05-23 08:59:08,638 [INFO] ingestify.application.loader: Running task CreateDatasetTask(StatsbombGithub -> competition_id=11/season_id=42/match_id=303725)
118
+ 2023-05-23 08:59:08,684 [INFO] ingestify.application.loader: Running task CreateDatasetTask(StatsbombGithub -> competition_id=11/season_id=42/match_id=303600)
119
+ 2023-05-23 08:59:08,962 [INFO] ingestify.application.loader: Running task CreateDatasetTask(StatsbombGithub -> competition_id=11/season_id=42/match_id=303493)
120
+ 2023-05-23 08:59:09,270 [INFO] ingestify.application.loader: Running task CreateDatasetTask(StatsbombGithub -> competition_id=11/season_id=42/match_id=303548)
121
+ 2023-05-23 08:59:09,276 [INFO] ingestify.application.loader: Running task CreateDatasetTask(StatsbombGithub -> competition_id=11/season_id=42/match_id=303674)
122
+ 2023-05-23 08:59:09,292 [INFO] ingestify.application.loader: Running task CreateDatasetTask(StatsbombGithub -> competition_id=11/season_id=42/match_id=303700)
123
+ 2023-05-23 08:59:09,332 [INFO] ingestify.application.loader: Running task CreateDatasetTask(StatsbombGithub -> competition_id=11/season_id=42/match_id=303666)
124
+ 2023-05-23 08:59:09,411 [INFO] ingestify.application.loader: Running task CreateDatasetTask(StatsbombGithub -> competition_id=11/season_id=42/match_id=303377)
125
+ 2023-05-23 08:59:09,462 [INFO] ingestify.application.loader: Running task CreateDatasetTask(StatsbombGithub -> competition_id=11/season_id=42/match_id=303517)
126
+ 2023-05-23 08:59:09,491 [INFO] ingestify.application.loader: Running task CreateDatasetTask(StatsbombGithub -> competition_id=11/season_id=42/match_id=303473)
127
+ 2023-05-23 08:59:09,511 [INFO] ingestify.application.loader: Running task CreateDatasetTask(StatsbombGithub -> competition_id=11/season_id=90/match_id=3773631)
128
+ 2023-05-23 08:59:09,726 [INFO] ingestify.application.loader: Running task CreateDatasetTask(StatsbombGithub -> competition_id=11/season_id=90/match_id=3773497)
129
+ 2023-05-23 08:59:09,757 [INFO] ingestify.application.loader: Running task CreateDatasetTask(StatsbombGithub -> competition_id=11/season_id=90/match_id=3773593)
130
+ 2023-05-23 08:59:09,957 [INFO] ingestify.application.loader: Running task CreateDatasetTask(StatsbombGithub -> competition_id=11/season_id=42/match_id=303652)
131
+ 2023-05-23 08:59:09,999 [INFO] ingestify.application.loader: Running task CreateDatasetTask(StatsbombGithub -> competition_id=11/season_id=42/match_id=303715)
132
+ 2023-05-23 08:59:10,075 [INFO] ingestify.application.loader: Running task CreateDatasetTask(StatsbombGithub -> competition_id=11/season_id=42/match_id=303470)
133
+ 2023-05-23 08:59:10,103 [INFO] ingestify.application.loader: Running task CreateDatasetTask(StatsbombGithub -> competition_id=11/season_id=42/match_id=303707)
134
+ 2023-05-23 08:59:10,188 [INFO] ingestify.application.loader: Running task CreateDatasetTask(StatsbombGithub -> competition_id=11/season_id=90/match_id=3773457)
135
+ 2023-05-23 08:59:10,248 [INFO] ingestify.application.loader: Running task CreateDatasetTask(StatsbombGithub -> competition_id=11/season_id=42/match_id=303524)
136
+ 2023-05-23 08:59:10,282 [INFO] ingestify.application.loader: Running task CreateDatasetTask(StatsbombGithub -> competition_id=11/season_id=90/match_id=3773665)
137
+ 2023-05-23 08:59:10,411 [INFO] ingestify.application.loader: Running task CreateDatasetTask(StatsbombGithub -> competition_id=11/season_id=42/match_id=303610)
138
+ 2023-05-23 08:59:10,563 [INFO] ingestify.application.loader: Running task CreateDatasetTask(StatsbombGithub -> competition_id=11/season_id=90/match_id=3773466)
139
+ 2023-05-23 08:59:10,711 [INFO] ingestify.application.loader: Running task CreateDatasetTask(StatsbombGithub -> competition_id=11/season_id=90/match_id=3773585)
140
+ 2023-05-23 08:59:10,768 [INFO] ingestify.application.loader: Running task CreateDatasetTask(StatsbombGithub -> competition_id=11/season_id=90/match_id=3773672)
141
+ 2023-05-23 08:59:10,778 [INFO] ingestify.application.loader: Running task CreateDatasetTask(StatsbombGithub -> competition_id=11/season_id=90/match_id=3773565)
142
+ 2023-05-23 08:59:10,867 [INFO] ingestify.application.loader: Running task CreateDatasetTask(StatsbombGithub -> competition_id=11/season_id=90/match_id=3773660)
143
+ 2023-05-23 08:59:10,954 [INFO] ingestify.application.loader: Running task CreateDatasetTask(StatsbombGithub -> competition_id=11/season_id=90/match_id=3773656)
144
+ 2023-05-23 08:59:10,974 [INFO] ingestify.application.loader: Running task CreateDatasetTask(StatsbombGithub -> competition_id=11/season_id=90/match_id=3773586)
145
+ 2023-05-23 08:59:11,026 [INFO] ingestify.application.loader: Running task CreateDatasetTask(StatsbombGithub -> competition_id=11/season_id=90/match_id=3773387)
146
+ 2023-05-23 08:59:11,136 [INFO] ingestify.application.loader: Running task CreateDatasetTask(StatsbombGithub -> competition_id=11/season_id=90/match_id=3773369)
147
+ 2023-05-23 08:59:11,438 [INFO] ingestify.application.loader: Running task CreateDatasetTask(StatsbombGithub -> competition_id=11/season_id=90/match_id=3773552)
148
+ 2023-05-23 08:59:11,515 [INFO] ingestify.application.loader: Running task CreateDatasetTask(StatsbombGithub -> competition_id=11/season_id=90/match_id=3773597)
149
+ 2023-05-23 08:59:11,586 [INFO] ingestify.application.loader: Running task CreateDatasetTask(StatsbombGithub -> competition_id=11/season_id=90/match_id=3773571)
150
+ 2023-05-23 08:59:11,610 [INFO] ingestify.application.loader: Running task CreateDatasetTask(StatsbombGithub -> competition_id=11/season_id=90/match_id=3773587)
151
+ 2023-05-23 08:59:11,690 [INFO] ingestify.application.loader: Running task CreateDatasetTask(StatsbombGithub -> competition_id=11/season_id=90/match_id=3773386)
152
+ 2023-05-23 08:59:11,727 [INFO] ingestify.application.loader: Running task CreateDatasetTask(StatsbombGithub -> competition_id=11/season_id=90/match_id=3773377)
153
+ 2023-05-23 08:59:11,757 [INFO] ingestify.application.loader: Running task CreateDatasetTask(StatsbombGithub -> competition_id=11/season_id=90/match_id=3773372)
154
+ 2023-05-23 08:59:11,899 [INFO] ingestify.application.loader: Running task CreateDatasetTask(StatsbombGithub -> competition_id=11/season_id=90/match_id=3764661)
155
+ 2023-05-23 08:59:11,901 [INFO] ingestify.application.loader: Running task CreateDatasetTask(StatsbombGithub -> competition_id=11/season_id=90/match_id=3773695)
156
+ 2023-05-23 08:59:12,006 [INFO] ingestify.application.loader: Running task CreateDatasetTask(StatsbombGithub -> competition_id=11/season_id=90/match_id=3773661)
157
+ 2023-05-23 08:59:12,186 [INFO] ingestify.application.loader: Running task CreateDatasetTask(StatsbombGithub -> competition_id=11/season_id=90/match_id=3773474)
158
+ 2023-05-23 08:59:12,283 [INFO] ingestify.application.loader: Running task CreateDatasetTask(StatsbombGithub -> competition_id=11/season_id=90/match_id=3773523)
159
+ 2023-05-23 08:59:12,339 [INFO] ingestify.application.loader: Running task CreateDatasetTask(StatsbombGithub -> competition_id=11/season_id=90/match_id=3773403)
160
+ 2023-05-23 08:59:12,426 [INFO] ingestify.application.loader: Running task CreateDatasetTask(StatsbombGithub -> competition_id=11/season_id=90/match_id=3773428)
161
+ 2023-05-23 08:59:12,582 [INFO] ingestify.application.loader: Running task CreateDatasetTask(StatsbombGithub -> competition_id=11/season_id=90/match_id=3773415)
162
+ 2023-05-23 08:59:12,583 [INFO] ingestify.application.loader: Running task CreateDatasetTask(StatsbombGithub -> competition_id=11/season_id=90/match_id=3773689)
163
+ 2023-05-23 08:59:12,705 [INFO] ingestify.application.loader: Running task CreateDatasetTask(StatsbombGithub -> competition_id=11/season_id=90/match_id=3773526)
164
+ 2023-05-23 08:59:13,510 [INFO] ingestify.application.loader: Running task CreateDatasetTask(StatsbombGithub -> competition_id=11/season_id=90/match_id=3773477)
165
+ 2023-05-23 08:59:13,538 [INFO] ingestify.application.loader: Running task CreateDatasetTask(StatsbombGithub -> competition_id=11/season_id=90/match_id=3764440)
166
+ 2023-05-23 08:59:13,592 [INFO] ingestify.application.loader: Running task CreateDatasetTask(StatsbombGithub -> competition_id=11/season_id=90/match_id=3773625)
167
+ 2023-05-23 08:59:15,017 [INFO] ingestify.application.loader: Running task CreateDatasetTask(StatsbombGithub -> competition_id=11/season_id=90/match_id=3773547)
168
+ 2023-05-23 08:59:15,917 [INFO] ingestify.cmdline: Done
169
+ ```
170
+
171
+ When we run it for the second time:
172
+ ```bash
173
+ bash# ingestify run
174
+
175
+ 2023-05-23 08:59:48,001 [INFO] ingestify.main: Initializing sources
176
+ 2023-05-23 08:59:48,002 [INFO] ingestify.main: Initializing IngestionEngine
177
+ 2023-05-23 08:59:48,006 [INFO] ingestify.main: Determining tasks...
178
+ 2023-05-23 08:59:48,067 [INFO] ingestify.application.loader: Discovered 33 datasets from StatsbombGithub using selector competition_id=11/season_id=42 => 0 tasks. 33 skipped.
179
+ 2023-05-23 08:59:48,118 [INFO] ingestify.application.loader: Discovered 35 datasets from StatsbombGithub using selector competition_id=11/season_id=90 => 0 tasks. 35 skipped.
180
+ 2023-05-23 08:59:48,118 [INFO] ingestify.application.loader: Nothing to do.
181
+ 2023-05-23 08:59:48,119 [INFO] ingestify.cmdline: Done
182
+ ```
183
+
184
+ ## Using the data
185
+
186
+ The project contains a `query.py` file with an example of how to use the data.
187
+
188
+ ```bash
189
+ bash# python query.py
190
+
191
+ Loaded dataset with 3702 events
192
+ Loaded dataset with 3994 events
193
+ Loaded dataset with 3831 events
194
+ Loaded dataset with 3647 events
195
+ Loaded dataset with 4062 events
196
+ Loaded dataset with 4051 events
197
+
198
+ .....
199
+
200
+ ```
201
+
202
+
203
+ How to go from raw data to parquet files:
204
+
205
+ ```python
206
+ from ingestify.main import get_datastore
207
+
208
+ store = get_datastore("config.yaml")
209
+
210
+ dataset_collection = store.get_dataset_collection(
211
+ provider="statsbomb", stage="raw"
212
+ )
213
+
214
+ # Store.map is using multiprocessing by default
215
+ store.map(
216
+ lambda dataset: (
217
+ store
218
+
219
+ # As it's related to https://github.com/PySport/kloppy the store can load files using kloppy
220
+ .load_with_kloppy(dataset)
221
+
222
+ # Convert it into a polars dataframe using all columns in the original data and some more additional ones
223
+ .to_df(
224
+ "*",
225
+ match_id=dataset.identifier.match_id,
226
+ competition_id=dataset.identifier.competition_id,
227
+ season_id=dataset.identifier.season_id,
228
+
229
+ engine="polars"
230
+ )
231
+
232
+ # Write to parquet format
233
+ .write_parquet(
234
+ f"/tmp/files/blaat/{dataset.identifier.match_id}.parquet"
235
+ )
236
+ ),
237
+ dataset_collection,
238
+ )
239
+
240
+ # TODO:
241
+ # - when a file is written in parquet format (or any other format) it should be added as such to the store.
242
+ ```
243
+
244
+
245
+ ## Future work
246
+
247
+ Future work includes:
248
+ - Workflow tools - Run custom workflows using tools like [Airflow](https://airflow.apache.org/), [Dagster](https://docs.dagster.io/getting-started), [Prefect](https://www.prefect.io/), [DBT](https://www.getdbt.com/)
249
+ - Execution engines - Run tasks on other execution engines like [AWS Lambda](https://aws.amazon.com/lambda/), [Dask](https://www.dask.org/)
250
+ - Lineage - Keep track of lineage with tools like [SQLLineage](https://sqllineage.readthedocs.io/en/latest/index.html)
251
+ - Data quality - Monitor data quality with tools like [Great Expectations](https://docs.greatexpectations.io/docs/tutorials/quickstart/)
252
+ - Event Bus - Automatically publish events to external systems like [AWS Event Bridge](https://aws.amazon.com/eventbridge/), [Azure Event Grid](https://learn.microsoft.com/en-us/azure/event-grid/overview), [Google Cloud Pub/Sub](https://cloud.google.com/pubsub/docs/overview), [Kafka](https://kafka.apache.org/), [RabbitMQ](https://www.rabbitmq.com/)
253
+ - Query Engines - Integrate with query engines to run SQL queries directly on the store using tools like [DuckDB](https://duckdb.org/), [DataBend](https://databend.rs/), [DataFusion](https://arrow.apache.org/datafusion/), [Polars](https://www.pola.rs/), [Spark](https://spark.apache.org/)
254
+ - Streaming Data - Ingest streaming data
@@ -0,0 +1,244 @@
1
+ # Ingestify
2
+
3
+ ## Data Management Platform
4
+
5
+ In general a data management platform contains:
6
+ 1. Ingestion of data (Extract from Source and Load into Data Lake)
7
+ 2. Transformation of data (Extract from Data Lake, Transform and Load into Data Warehouse)
8
+ 3. Utilization of data
9
+
10
+ <img src="https://www.getdbt.com/ui/img/blog/what-exactly-is-dbt/1-BogoeTTK1OXFU1hPfUyCFw.png" />
11
+ Source: https://www.getdbt.com/blog/what-exactly-is-dbt/
12
+
13
+ TODO: Improve drawings and explain more
14
+
15
+ ## Ingestify
16
+
17
+ Ingestify focuses on the ingestion of data.
18
+
19
+ ### How does Ingestify work?
20
+
21
+ 1. A `Source` is asked for all available `Datasets` using the `discover_datasets` method
22
+ 2. All available `Datasets` are compared with what's already fetched to determine which ones are missing or have changed (using a `FetchPolicy`)
23
+ 3. A `TaskQueue` is filled with `Tasks` to fetch all missing or stale `Datasets`
24
+
25
+ <img src="https://raw.githubusercontent.com/PySport/ingestify/refs/heads/main/docs/overview.svg" />
26
+
27
+ - [Source](blob/main/ingestify/domain/models/source.py) is the main entrance from Ingestify to external sources. A Source must always define:
28
+ - `discover_datasets` - Creates a list of all available datasets on the Source
29
+ - `fetch_dataset_files` - Fetches a single dataset for a Source
30
+ - [Dataset Store](blob/main/ingestify/application/dataset_store.py) manages the access to the Metadata storage and the file storage. It keeps track of versions, and knows how to load data.
31
+ - [Loader](blob/main/ingestify/application/loader.py) organizes the fetching process. It does this by executing the following steps:
32
+ 1. Ask `Source` for all available datasets for a selector
33
+ 2. Ask `Dataset Store` for all available datasets for a selector
34
+ 3. Determine missing `Datasets`
35
+ 4. Create tasks for data retrieval and put them in the `TaskQueue`
36
+ 5. Use multiprocessing to execute all tasks
37
+
38
+ ## Get started
39
+
40
+ ### Install
41
+
42
+ Make sure you have installed the latest version:
43
+ ```bash
44
+ pip install git+https://github.com/PySport/ingestify.git
45
+
46
+ # OR
47
+
48
+ pip install git+ssh://git@github.com/PySport/ingestify.git
49
+ ```
50
+
51
+ ### Using a template
52
+
53
+ Ingestify provides some templates to get started quickly. When using `ingestify init` a new project will be created and example files are copied.
54
+ Currently, Ingestify offers a `statsbomb_github` and `wyscout` template.
55
+
56
+ #### Statsbomb Github
57
+
58
+ This uses https://github.com/statsbomb/open-data as source and syncs some competitions.
59
+
60
+ ```
61
+ bash# ingestify init --template statsbomb_github /tmp/ingestify-test
62
+
63
+ 2023-05-23 08:57:51,250 [INFO] ingestify.cmdline: Initialized project at `/tmp/ingestify-test` with template `statsbomb_github`
64
+ ```
65
+
66
+ #### Wyscout
67
+
68
+ This requires valid Wyscout credentials. The template includes some security best practices, like using a `.env` file for credentials which isn't part of version control.
69
+
70
+ ```
71
+ bash# ingestify init --template wyscout /tmp/ingestify-test
72
+
73
+ 2023-05-23 08:58:18,720 [INFO] ingestify.cmdline: Initialized project at `/tmp/ingestify-test` with template `wyscout`
74
+ ```
75
+
76
+ ### Running Ingestify
77
+
78
+ To actually run Ingestify you first change the current directory to the project directory.
79
+
80
+ Then run:
81
+ ```bash
82
+ bash# ingestify run
83
+
84
+ 2023-05-23 08:59:07,066 [INFO] ingestify.main: Initializing sources
85
+ 2023-05-23 08:59:07,068 [INFO] ingestify.main: Initializing IngestionEngine
86
+ 2023-05-23 08:59:07,086 [INFO] ingestify.main: Determining tasks...
87
+ 2023-05-23 08:59:07,364 [INFO] ingestify.application.loader: Discovered 33 datasets from StatsbombGithub using selector competition_id=11/season_id=42 => 33 tasks. 0 skipped.
88
+ 2023-05-23 08:59:07,625 [INFO] ingestify.application.loader: Discovered 35 datasets from StatsbombGithub using selector competition_id=11/season_id=90 => 35 tasks. 0 skipped.
89
+ 2023-05-23 08:59:07,625 [INFO] ingestify.application.loader: Scheduled 68 tasks. With 10 processes
90
+ 2023-05-23 08:59:07,654 [INFO] ingestify.application.loader: Running task CreateDatasetTask(StatsbombGithub -> competition_id=11/season_id=42/match_id=303516)
91
+ 2023-05-23 08:59:07,654 [INFO] ingestify.application.loader: Running task CreateDatasetTask(StatsbombGithub -> competition_id=11/season_id=42/match_id=303731)
92
+ 2023-05-23 08:59:07,655 [INFO] ingestify.application.loader: Running task CreateDatasetTask(StatsbombGithub -> competition_id=11/season_id=42/match_id=303430)
93
+ 2023-05-23 08:59:07,655 [INFO] ingestify.application.loader: Running task CreateDatasetTask(StatsbombGithub -> competition_id=11/season_id=42/match_id=303504)
94
+ 2023-05-23 08:59:07,655 [INFO] ingestify.application.loader: Running task CreateDatasetTask(StatsbombGithub -> competition_id=11/season_id=42/match_id=303421)
95
+ 2023-05-23 08:59:07,655 [INFO] ingestify.application.loader: Running task CreateDatasetTask(StatsbombGithub -> competition_id=11/season_id=42/match_id=303400)
96
+ 2023-05-23 08:59:07,656 [INFO] ingestify.application.loader: Running task CreateDatasetTask(StatsbombGithub -> competition_id=11/season_id=42/match_id=303664)
97
+ 2023-05-23 08:59:07,656 [INFO] ingestify.application.loader: Running task CreateDatasetTask(StatsbombGithub -> competition_id=11/season_id=42/match_id=303680)
98
+ 2023-05-23 08:59:07,657 [INFO] ingestify.application.loader: Running task CreateDatasetTask(StatsbombGithub -> competition_id=11/season_id=42/match_id=303487)
99
+ 2023-05-23 08:59:07,658 [INFO] ingestify.application.loader: Running task CreateDatasetTask(StatsbombGithub -> competition_id=11/season_id=42/match_id=303615)
100
+ 2023-05-23 08:59:08,419 [INFO] ingestify.application.loader: Running task CreateDatasetTask(StatsbombGithub -> competition_id=11/season_id=42/match_id=303532)
101
+ 2023-05-23 08:59:08,421 [INFO] ingestify.application.loader: Running task CreateDatasetTask(StatsbombGithub -> competition_id=11/season_id=42/match_id=303682)
102
+ 2023-05-23 08:59:08,444 [INFO] ingestify.application.loader: Running task CreateDatasetTask(StatsbombGithub -> competition_id=11/season_id=42/match_id=303451)
103
+ 2023-05-23 08:59:08,462 [INFO] ingestify.application.loader: Running task CreateDatasetTask(StatsbombGithub -> competition_id=11/season_id=42/match_id=303596)
104
+ 2023-05-23 08:59:08,518 [INFO] ingestify.application.loader: Running task CreateDatasetTask(StatsbombGithub -> competition_id=11/season_id=42/match_id=303634)
105
+ 2023-05-23 08:59:08,528 [INFO] ingestify.application.loader: Running task CreateDatasetTask(StatsbombGithub -> competition_id=11/season_id=42/match_id=303479)
106
+ 2023-05-23 08:59:08,541 [INFO] ingestify.application.loader: Running task CreateDatasetTask(StatsbombGithub -> competition_id=11/season_id=42/match_id=303696)
107
+ 2023-05-23 08:59:08,638 [INFO] ingestify.application.loader: Running task CreateDatasetTask(StatsbombGithub -> competition_id=11/season_id=42/match_id=303725)
108
+ 2023-05-23 08:59:08,684 [INFO] ingestify.application.loader: Running task CreateDatasetTask(StatsbombGithub -> competition_id=11/season_id=42/match_id=303600)
109
+ 2023-05-23 08:59:08,962 [INFO] ingestify.application.loader: Running task CreateDatasetTask(StatsbombGithub -> competition_id=11/season_id=42/match_id=303493)
110
+ 2023-05-23 08:59:09,270 [INFO] ingestify.application.loader: Running task CreateDatasetTask(StatsbombGithub -> competition_id=11/season_id=42/match_id=303548)
111
+ 2023-05-23 08:59:09,276 [INFO] ingestify.application.loader: Running task CreateDatasetTask(StatsbombGithub -> competition_id=11/season_id=42/match_id=303674)
112
+ 2023-05-23 08:59:09,292 [INFO] ingestify.application.loader: Running task CreateDatasetTask(StatsbombGithub -> competition_id=11/season_id=42/match_id=303700)
113
+ 2023-05-23 08:59:09,332 [INFO] ingestify.application.loader: Running task CreateDatasetTask(StatsbombGithub -> competition_id=11/season_id=42/match_id=303666)
114
+ 2023-05-23 08:59:09,411 [INFO] ingestify.application.loader: Running task CreateDatasetTask(StatsbombGithub -> competition_id=11/season_id=42/match_id=303377)
115
+ 2023-05-23 08:59:09,462 [INFO] ingestify.application.loader: Running task CreateDatasetTask(StatsbombGithub -> competition_id=11/season_id=42/match_id=303517)
116
+ 2023-05-23 08:59:09,491 [INFO] ingestify.application.loader: Running task CreateDatasetTask(StatsbombGithub -> competition_id=11/season_id=42/match_id=303473)
117
+ 2023-05-23 08:59:09,511 [INFO] ingestify.application.loader: Running task CreateDatasetTask(StatsbombGithub -> competition_id=11/season_id=90/match_id=3773631)
118
+ 2023-05-23 08:59:09,726 [INFO] ingestify.application.loader: Running task CreateDatasetTask(StatsbombGithub -> competition_id=11/season_id=90/match_id=3773497)
119
+ 2023-05-23 08:59:09,757 [INFO] ingestify.application.loader: Running task CreateDatasetTask(StatsbombGithub -> competition_id=11/season_id=90/match_id=3773593)
120
+ 2023-05-23 08:59:09,957 [INFO] ingestify.application.loader: Running task CreateDatasetTask(StatsbombGithub -> competition_id=11/season_id=42/match_id=303652)
121
+ 2023-05-23 08:59:09,999 [INFO] ingestify.application.loader: Running task CreateDatasetTask(StatsbombGithub -> competition_id=11/season_id=42/match_id=303715)
122
+ 2023-05-23 08:59:10,075 [INFO] ingestify.application.loader: Running task CreateDatasetTask(StatsbombGithub -> competition_id=11/season_id=42/match_id=303470)
123
+ 2023-05-23 08:59:10,103 [INFO] ingestify.application.loader: Running task CreateDatasetTask(StatsbombGithub -> competition_id=11/season_id=42/match_id=303707)
124
+ 2023-05-23 08:59:10,188 [INFO] ingestify.application.loader: Running task CreateDatasetTask(StatsbombGithub -> competition_id=11/season_id=90/match_id=3773457)
125
+ 2023-05-23 08:59:10,248 [INFO] ingestify.application.loader: Running task CreateDatasetTask(StatsbombGithub -> competition_id=11/season_id=42/match_id=303524)
126
+ 2023-05-23 08:59:10,282 [INFO] ingestify.application.loader: Running task CreateDatasetTask(StatsbombGithub -> competition_id=11/season_id=90/match_id=3773665)
127
+ 2023-05-23 08:59:10,411 [INFO] ingestify.application.loader: Running task CreateDatasetTask(StatsbombGithub -> competition_id=11/season_id=42/match_id=303610)
128
+ 2023-05-23 08:59:10,563 [INFO] ingestify.application.loader: Running task CreateDatasetTask(StatsbombGithub -> competition_id=11/season_id=90/match_id=3773466)
129
+ 2023-05-23 08:59:10,711 [INFO] ingestify.application.loader: Running task CreateDatasetTask(StatsbombGithub -> competition_id=11/season_id=90/match_id=3773585)
130
+ 2023-05-23 08:59:10,768 [INFO] ingestify.application.loader: Running task CreateDatasetTask(StatsbombGithub -> competition_id=11/season_id=90/match_id=3773672)
131
+ 2023-05-23 08:59:10,778 [INFO] ingestify.application.loader: Running task CreateDatasetTask(StatsbombGithub -> competition_id=11/season_id=90/match_id=3773565)
132
+ 2023-05-23 08:59:10,867 [INFO] ingestify.application.loader: Running task CreateDatasetTask(StatsbombGithub -> competition_id=11/season_id=90/match_id=3773660)
133
+ 2023-05-23 08:59:10,954 [INFO] ingestify.application.loader: Running task CreateDatasetTask(StatsbombGithub -> competition_id=11/season_id=90/match_id=3773656)
134
+ 2023-05-23 08:59:10,974 [INFO] ingestify.application.loader: Running task CreateDatasetTask(StatsbombGithub -> competition_id=11/season_id=90/match_id=3773586)
135
+ 2023-05-23 08:59:11,026 [INFO] ingestify.application.loader: Running task CreateDatasetTask(StatsbombGithub -> competition_id=11/season_id=90/match_id=3773387)
136
+ 2023-05-23 08:59:11,136 [INFO] ingestify.application.loader: Running task CreateDatasetTask(StatsbombGithub -> competition_id=11/season_id=90/match_id=3773369)
137
+ 2023-05-23 08:59:11,438 [INFO] ingestify.application.loader: Running task CreateDatasetTask(StatsbombGithub -> competition_id=11/season_id=90/match_id=3773552)
138
+ 2023-05-23 08:59:11,515 [INFO] ingestify.application.loader: Running task CreateDatasetTask(StatsbombGithub -> competition_id=11/season_id=90/match_id=3773597)
139
+ 2023-05-23 08:59:11,586 [INFO] ingestify.application.loader: Running task CreateDatasetTask(StatsbombGithub -> competition_id=11/season_id=90/match_id=3773571)
140
+ 2023-05-23 08:59:11,610 [INFO] ingestify.application.loader: Running task CreateDatasetTask(StatsbombGithub -> competition_id=11/season_id=90/match_id=3773587)
141
+ 2023-05-23 08:59:11,690 [INFO] ingestify.application.loader: Running task CreateDatasetTask(StatsbombGithub -> competition_id=11/season_id=90/match_id=3773386)
142
+ 2023-05-23 08:59:11,727 [INFO] ingestify.application.loader: Running task CreateDatasetTask(StatsbombGithub -> competition_id=11/season_id=90/match_id=3773377)
143
+ 2023-05-23 08:59:11,757 [INFO] ingestify.application.loader: Running task CreateDatasetTask(StatsbombGithub -> competition_id=11/season_id=90/match_id=3773372)
144
+ 2023-05-23 08:59:11,899 [INFO] ingestify.application.loader: Running task CreateDatasetTask(StatsbombGithub -> competition_id=11/season_id=90/match_id=3764661)
145
+ 2023-05-23 08:59:11,901 [INFO] ingestify.application.loader: Running task CreateDatasetTask(StatsbombGithub -> competition_id=11/season_id=90/match_id=3773695)
146
+ 2023-05-23 08:59:12,006 [INFO] ingestify.application.loader: Running task CreateDatasetTask(StatsbombGithub -> competition_id=11/season_id=90/match_id=3773661)
147
+ 2023-05-23 08:59:12,186 [INFO] ingestify.application.loader: Running task CreateDatasetTask(StatsbombGithub -> competition_id=11/season_id=90/match_id=3773474)
148
+ 2023-05-23 08:59:12,283 [INFO] ingestify.application.loader: Running task CreateDatasetTask(StatsbombGithub -> competition_id=11/season_id=90/match_id=3773523)
149
+ 2023-05-23 08:59:12,339 [INFO] ingestify.application.loader: Running task CreateDatasetTask(StatsbombGithub -> competition_id=11/season_id=90/match_id=3773403)
150
+ 2023-05-23 08:59:12,426 [INFO] ingestify.application.loader: Running task CreateDatasetTask(StatsbombGithub -> competition_id=11/season_id=90/match_id=3773428)
151
+ 2023-05-23 08:59:12,582 [INFO] ingestify.application.loader: Running task CreateDatasetTask(StatsbombGithub -> competition_id=11/season_id=90/match_id=3773415)
152
+ 2023-05-23 08:59:12,583 [INFO] ingestify.application.loader: Running task CreateDatasetTask(StatsbombGithub -> competition_id=11/season_id=90/match_id=3773689)
153
+ 2023-05-23 08:59:12,705 [INFO] ingestify.application.loader: Running task CreateDatasetTask(StatsbombGithub -> competition_id=11/season_id=90/match_id=3773526)
154
+ 2023-05-23 08:59:13,510 [INFO] ingestify.application.loader: Running task CreateDatasetTask(StatsbombGithub -> competition_id=11/season_id=90/match_id=3773477)
155
+ 2023-05-23 08:59:13,538 [INFO] ingestify.application.loader: Running task CreateDatasetTask(StatsbombGithub -> competition_id=11/season_id=90/match_id=3764440)
156
+ 2023-05-23 08:59:13,592 [INFO] ingestify.application.loader: Running task CreateDatasetTask(StatsbombGithub -> competition_id=11/season_id=90/match_id=3773625)
157
+ 2023-05-23 08:59:15,017 [INFO] ingestify.application.loader: Running task CreateDatasetTask(StatsbombGithub -> competition_id=11/season_id=90/match_id=3773547)
158
+ 2023-05-23 08:59:15,917 [INFO] ingestify.cmdline: Done
159
+ ```
160
+
161
+ When we run it a second time, every dataset is skipped and there is nothing left to do:
162
+ ```bash
163
+ bash# ingestify run
164
+
165
+ 2023-05-23 08:59:48,001 [INFO] ingestify.main: Initializing sources
166
+ 2023-05-23 08:59:48,002 [INFO] ingestify.main: Initializing IngestionEngine
167
+ 2023-05-23 08:59:48,006 [INFO] ingestify.main: Determining tasks...
168
+ 2023-05-23 08:59:48,067 [INFO] ingestify.application.loader: Discovered 33 datasets from StatsbombGithub using selector competition_id=11/season_id=42 => 0 tasks. 33 skipped.
169
+ 2023-05-23 08:59:48,118 [INFO] ingestify.application.loader: Discovered 35 datasets from StatsbombGithub using selector competition_id=11/season_id=90 => 0 tasks. 35 skipped.
170
+ 2023-05-23 08:59:48,118 [INFO] ingestify.application.loader: Nothing to do.
171
+ 2023-05-23 08:59:48,119 [INFO] ingestify.cmdline: Done
172
+ ```
173
+
174
+ ## Using the data
175
+
176
+ The project contains a `query.py` file with an example of how to use the data.
177
+
178
+ ```bash
179
+ bash# python query.py
180
+
181
+ Loaded dataset with 3702 events
182
+ Loaded dataset with 3994 events
183
+ Loaded dataset with 3831 events
184
+ Loaded dataset with 3647 events
185
+ Loaded dataset with 4062 events
186
+ Loaded dataset with 4051 events
187
+
188
+ .....
189
+
190
+ ```
191
+
192
+
193
+ How to go from raw data to parquet files:
194
+
195
+ ```python
196
+ from ingestify.main import get_datastore
197
+
198
+ store = get_datastore("config.yaml")
199
+
200
+ dataset_collection = store.get_dataset_collection(
201
+ provider="statsbomb", stage="raw"
202
+ )
203
+
204
+ # store.map uses multiprocessing by default
205
+ store.map(
206
+ lambda dataset: (
207
+ store
208
+
209
+ # As it's related to https://github.com/PySport/kloppy the store can load files using kloppy
210
+ .load_with_kloppy(dataset)
211
+
212
+ # Convert it into a polars dataframe using all columns in the original data plus some additional ones
213
+ .to_df(
214
+ "*",
215
+ match_id=dataset.identifier.match_id,
216
+ competition_id=dataset.identifier.competition_id,
217
+ season_id=dataset.identifier.season_id,
218
+
219
+ engine="polars"
220
+ )
221
+
222
+ # Write to parquet format
223
+ .write_parquet(
224
+ f"/tmp/files/blaat/{dataset.identifier.match_id}.parquet"
225
+ )
226
+ ),
227
+ dataset_collection,
228
+ )
229
+
230
+ # TODO:
231
+ # - when a file is written in parquet format (or any other format) it should be added as such to the store.
232
+ ```
233
+
234
+
235
+ ## Future work
236
+
237
+ Future work includes:
238
+ - Workflow tools - Run custom workflows using tools like [Airflow](https://airflow.apache.org/), [Dagster](https://docs.dagster.io/getting-started), [Prefect](https://www.prefect.io/), [DBT](https://www.getdbt.com/)
239
+ - Execution engines - Run tasks on other execution engines like [AWS Lambda](https://aws.amazon.com/lambda/), [Dask](https://www.dask.org/)
240
+ - Lineage - Keep track of lineage with tools like [SQLLineage](https://sqllineage.readthedocs.io/en/latest/index.html)
241
+ - Data quality - Monitor data quality with tools like [Great Expectations](https://docs.greatexpectations.io/docs/tutorials/quickstart/)
242
+ - Event Bus - Automatically publish events to external systems like [AWS Event Bridge](https://aws.amazon.com/eventbridge/), [Azure Event Grid](https://learn.microsoft.com/en-us/azure/event-grid/overview), [Google Cloud Pub/Sub](https://cloud.google.com/pubsub/docs/overview), [Kafka](https://kafka.apache.org/), [RabbitMQ](https://www.rabbitmq.com/)
243
+ - Query Engines - Integrate with query engines to run SQL queries directly on the store using tools like [DuckDB](https://duckdb.org/), [DataBend](https://databend.rs/), [DataFusion](https://arrow.apache.org/datafusion/), [Polars](https://www.pola.rs/), [Spark](https://spark.apache.org/)
244
+ - Streaming Data - Ingest streaming data
@@ -0,0 +1,11 @@
1
+ # detect if we are imported from the setup procedure (borrowed from numpy code)
2
+ try:
3
+ __INGESTIFY_SETUP__
4
+ except NameError:
5
+ __INGESTIFY_SETUP__ = False
6
+
7
+ if not __INGESTIFY_SETUP__:
8
+ from .infra import retrieve_http
9
+ from .source_base import Source, DatasetResource
10
+
11
+ __version__ = "0.1.0"
File without changes