ingestify-0.1.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (79)
  1. ingestify/__init__.py +11 -0
  2. ingestify/application/__init__.py +0 -0
  3. ingestify/application/dataset_store.py +339 -0
  4. ingestify/application/ingestion_engine.py +62 -0
  5. ingestify/application/loader.py +329 -0
  6. ingestify/application/secrets_manager.py +53 -0
  7. ingestify/cmdline.py +283 -0
  8. ingestify/domain/__init__.py +2 -0
  9. ingestify/domain/models/__init__.py +45 -0
  10. ingestify/domain/models/data_spec_version_collection.py +33 -0
  11. ingestify/domain/models/dataset/__init__.py +27 -0
  12. ingestify/domain/models/dataset/collection.py +44 -0
  13. ingestify/domain/models/dataset/collection_metadata.py +13 -0
  14. ingestify/domain/models/dataset/dataset.py +104 -0
  15. ingestify/domain/models/dataset/dataset_repository.py +46 -0
  16. ingestify/domain/models/dataset/events.py +31 -0
  17. ingestify/domain/models/dataset/file.py +146 -0
  18. ingestify/domain/models/dataset/file_collection.py +35 -0
  19. ingestify/domain/models/dataset/file_repository.py +59 -0
  20. ingestify/domain/models/dataset/identifier.py +24 -0
  21. ingestify/domain/models/dataset/revision.py +29 -0
  22. ingestify/domain/models/dataset/selector.py +37 -0
  23. ingestify/domain/models/event/__init__.py +4 -0
  24. ingestify/domain/models/event/_old_event.py +21 -0
  25. ingestify/domain/models/event/dispatcher.py +8 -0
  26. ingestify/domain/models/event/domain_event.py +10 -0
  27. ingestify/domain/models/event/event_bus.py +24 -0
  28. ingestify/domain/models/event/publisher.py +23 -0
  29. ingestify/domain/models/event/subscriber.py +39 -0
  30. ingestify/domain/models/extract_job.py +23 -0
  31. ingestify/domain/models/fetch_policy.py +40 -0
  32. ingestify/domain/models/resources/__init__.py +1 -0
  33. ingestify/domain/models/resources/dataset_resource.py +99 -0
  34. ingestify/domain/models/sink.py +16 -0
  35. ingestify/domain/models/source.py +34 -0
  36. ingestify/domain/models/task/__init__.py +4 -0
  37. ingestify/domain/models/task/set.py +21 -0
  38. ingestify/domain/models/task/task.py +7 -0
  39. ingestify/domain/services/__init__.py +0 -0
  40. ingestify/domain/services/transformers/__init__.py +0 -0
  41. ingestify/domain/services/transformers/kloppy_to_pandas.py +25 -0
  42. ingestify/exceptions.py +10 -0
  43. ingestify/infra/__init__.py +4 -0
  44. ingestify/infra/fetch/__init__.py +0 -0
  45. ingestify/infra/fetch/http.py +100 -0
  46. ingestify/infra/serialization/__init__.py +50 -0
  47. ingestify/infra/sink/__init__.py +0 -0
  48. ingestify/infra/sink/postgresql.py +50 -0
  49. ingestify/infra/source/__init__.py +0 -0
  50. ingestify/infra/source/statsbomb_github.py +92 -0
  51. ingestify/infra/source/wyscout.py +175 -0
  52. ingestify/infra/store/__init__.py +2 -0
  53. ingestify/infra/store/dataset/__init__.py +2 -0
  54. ingestify/infra/store/dataset/local_dataset_repository.py +73 -0
  55. ingestify/infra/store/dataset/sqlalchemy/__init__.py +1 -0
  56. ingestify/infra/store/dataset/sqlalchemy/mapping.py +153 -0
  57. ingestify/infra/store/dataset/sqlalchemy/repository.py +239 -0
  58. ingestify/infra/store/file/__init__.py +2 -0
  59. ingestify/infra/store/file/local_file_repository.py +32 -0
  60. ingestify/infra/store/file/s3_file_repository.py +50 -0
  61. ingestify/main.py +205 -0
  62. ingestify/server.py +78 -0
  63. ingestify/source_base.py +23 -0
  64. ingestify/static/templates/statsbomb_github/README.md +0 -0
  65. ingestify/static/templates/statsbomb_github/config.yaml.jinja2 +19 -0
  66. ingestify/static/templates/statsbomb_github/database/README.md +1 -0
  67. ingestify/static/templates/statsbomb_github/query.py +14 -0
  68. ingestify/static/templates/wyscout/.env +5 -0
  69. ingestify/static/templates/wyscout/.gitignore +2 -0
  70. ingestify/static/templates/wyscout/README.md +0 -0
  71. ingestify/static/templates/wyscout/config.yaml.jinja2 +18 -0
  72. ingestify/static/templates/wyscout/database/README.md +1 -0
  73. ingestify/static/templates/wyscout/query.py +14 -0
  74. ingestify/utils.py +276 -0
  75. ingestify-0.1.0.dist-info/METADATA +265 -0
  76. ingestify-0.1.0.dist-info/RECORD +79 -0
  77. ingestify-0.1.0.dist-info/WHEEL +5 -0
  78. ingestify-0.1.0.dist-info/entry_points.txt +2 -0
  79. ingestify-0.1.0.dist-info/top_level.txt +1 -0

ingestify/infra/source/statsbomb_github.py
@@ -0,0 +1,92 @@
+ import json
+ from datetime import datetime
+
+ import requests
+
+ from ingestify import Source, retrieve_http
+ from ingestify.domain import DraftFile
+ from ingestify.domain.models.dataset.dataset import DatasetState
+
+ BASE_URL = "https://raw.githubusercontent.com/statsbomb/open-data/master/data"
+
+
+ class StatsbombGithub(Source):
+     provider = "statsbomb"
+
+     def discover_selectors(self, dataset_type: str, data_spec_versions: None = None):
+         assert dataset_type == "match"
+
+         competitions = requests.get(f"{BASE_URL}/competitions.json").json()
+         return [
+             dict(
+                 competition_id=competition["competition_id"],
+                 season_id=competition["season_id"],
+             )
+             for competition in competitions
+         ]
+
+     def discover_datasets(
+         self,
+         dataset_type,
+         competition_id: str = None,
+         season_id: str = None,
+         data_spec_versions=None,
+     ):
+         assert dataset_type == "match"
+
+         datasets = []
+
+         matches = requests.get(
+             f"{BASE_URL}/matches/{competition_id}/{season_id}.json"
+         ).json()
+
+         for match in matches:
+             last_updated = match["last_updated"]
+             if "Z" not in last_updated:
+                 # Assume UTC
+                 last_updated += "Z"
+
+             last_modified = datetime.fromisoformat(last_updated.replace("Z", "+00:00"))
+
+             dataset = dict(
+                 competition_id=competition_id,
+                 season_id=season_id,
+                 match_id=match["match_id"],
+                 _last_modified=last_modified,
+                 _match=match,
+                 _metadata=match,
+                 _state=DatasetState.COMPLETE,
+             )
+             datasets.append(dataset)
+         return datasets
+
+     def fetch_dataset_files(
+         self, dataset_type, identifier, current_revision, data_spec_versions
+     ):
+         assert dataset_type == "match"
+
+         current_files = current_revision.modified_files_map if current_revision else {}
+         files = {}
+         for filename, url in [
+             ("lineups.json", f"{BASE_URL}/lineups/{identifier.match_id}.json"),
+             ("events.json", f"{BASE_URL}/events/{identifier.match_id}.json"),
+         ]:
+             data_feed_key = filename.split(".")[0]
+             file_id = data_feed_key + "__v1"
+             files[file_id] = retrieve_http(
+                 url,
+                 current_files.get(filename),
+                 file_data_feed_key=data_feed_key,
+                 file_data_spec_version="v1",
+                 file_data_serialization_format="json",
+             )
+
+         files["match__v1"] = DraftFile.from_input(
+             json.dumps(identifier._match, indent=4),
+             data_feed_key="match",
+             data_spec_version="v1",
+             data_serialization_format="json",
+             modified_at=None,
+         )
+
+         return files
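
For orientation, a minimal usage sketch (not part of the package) showing how this source's discovery methods chain together. It assumes a Source subclass can be constructed with just a name, as suggested by the Wyscout source further down, and that network access to the StatsBomb open-data repository is available.

from ingestify.infra.source.statsbomb_github import StatsbombGithub

# Hypothetical instantiation; Source.__init__(name) is inferred from the Wyscout source below.
source = StatsbombGithub(name="statsbomb")

# Selectors are plain dicts of competition_id/season_id pairs taken from competitions.json.
selectors = source.discover_selectors(dataset_type="match")

# Each discovered dataset dict carries the identifier plus "_"-prefixed fields
# such as _last_modified, _metadata and _state.
datasets = source.discover_datasets(dataset_type="match", **selectors[0])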

ingestify/infra/source/wyscout.py
@@ -0,0 +1,175 @@
+ import json
+ from typing import Optional, Dict, List
+
+ import requests
+
+ from ingestify import Source, retrieve_http
+ from ingestify.domain import DraftFile
+ from ingestify.exceptions import ConfigurationError
+
+ BASE_URL = "https://apirest.wyscout.com/v3"
+
+
+ def wyscout_pager_fn(url, response):
+     if response["meta"]["page_current"] < response["meta"]["page_count"]:
+         return f"{url}&page={response['meta']['page_current'] + 1}"
+     else:
+         return None
+
+
+ class Wyscout(Source):
+     def discover_selectors(self, dataset_type: str) -> List[Dict]:
+         raise NotImplementedError("Not implemented for Wyscout")
+
+     provider = "wyscout"
+
+     def __init__(self, name: str, username: str, password: str):
+         super().__init__(name)
+
+         self.username = username.strip()
+         self.password = password.strip()
+
+         if not self.username:
+             raise ConfigurationError(
+                 f"Username of Wyscout source named '{self.name}' cannot be empty"
+             )
+
+         if not self.password:
+             raise ConfigurationError(
+                 f"Password of Wyscout source named '{self.name}' cannot be empty"
+             )
+
+     def _get(self, path: str):
+         response = requests.get(
+             BASE_URL + path,
+             auth=(self.username, self.password),
+         )
+         if response.status_code == 400:
+             # What if the response isn't a json?
+             error = response.json()["error"]
+             raise ConfigurationError(
+                 f"Check username/password of Wyscout source named '{self.name}'. API response "
+                 f"was '{error['message']}' ({error['code']})."
+             )
+
+         response.raise_for_status()
+         return response.json()
+
+     def _get_paged(self, path: str, data_path: str):
+         data = []
+         current_page = 1
+         page_count = None
+         while page_count is None or current_page <= page_count:
+             page_data = self._get(path + f"?page={current_page}&limit=100")
+             page_count = page_data["meta"]["page_count"]
+
+             data.extend(page_data[data_path])
+             current_page += 1
+
+         return data
+
+     def discover_datasets(self, dataset_type: str, season_id: int):
+         matches = self._get(f"/seasons/{season_id}/matches")
+         datasets = []
+         for match in matches["matches"]:
+             dataset = dict(match_id=match["matchId"], version="v3", _metadata=match)
+             datasets.append(dataset)
+
+         return datasets
+
+     def fetch_dataset_files(
+         self, dataset_type, identifier, current_version
+     ) -> Dict[str, Optional[DraftFile]]:
+         current_files = current_version.modified_files_map if current_version else {}
+         files = {}
+
+         for filename, url in [
+             (
+                 "events.json",
+                 f"{BASE_URL}/matches/{identifier.match_id}/events?fetch=teams,players",
+             ),
+         ]:
+             files[filename] = retrieve_http(
+                 url, current_files.get(filename), auth=(self.username, self.password)
+             )
+         return files
+
+
+ #
+ # class WyscoutEvent(Wyscout):
+ #     dataset_type = "event"
+ #
+ #     def discover_datasets(self, season_id: int):
+ #         matches = self._get(f"/seasons/{season_id}/matches")
+ #         datasets = []
+ #         for match in matches["matches"]:
+ #             dataset = dict(match_id=match["matchId"], version="v3", _metadata=match)
+ #             datasets.append(dataset)
+ #
+ #         return datasets
+ #
+ #     def fetch_dataset_files(
+ #         self, identifier, current_version
+ #     ) -> Dict[str, Optional[DraftFile]]:
+ #         current_files = current_version.modified_files_map if current_version else {}
+ #         files = {}
+ #
+ #         for filename, url in [
+ #             (
+ #                 "events.json",
+ #                 f"{BASE_URL}/matches/{identifier.match_id}/events?fetch=teams,players",
+ #             ),
+ #         ]:
+ #             files[filename] = retrieve_http(
+ #                 url, current_files.get(filename), auth=(self.username, self.password)
+ #             )
+ #         return files
+ #
+ #
+ # class WyscoutPlayer(Wyscout):
+ #     dataset_type = "player"
+ #
+ #     def discover_datasets(self, season_id: int):
+ #         return [
+ #             dict(
+ #                 version="v3",
+ #             )
+ #         ]
+ #
+ #     def fetch_dataset_files(
+ #         self, identifier, current_version
+ #     ) -> Dict[str, Optional[DraftFile]]:
+ #         current_files = current_version.modified_files_map if current_version else {}
+ #
+ #         return {
+ #             "players.json": retrieve_http(
+ #                 f"{BASE_URL}/seasons/{identifier.season_id}/players?limit=100",
+ #                 current_files.get("players.json"),
+ #                 pager=("players", wyscout_pager_fn),
+ #                 auth=(self.username, self.password),
+ #             )
+ #         }
+
+
+ if __name__ == "__main__":
+     import dotenv, os
+
+     dotenv.load_dotenv()
+
+     kilmarnock_id = 8516
+     competition_id = 750
+     season_id = 188105
+     match_id = 5459107
+     player_id = 840543
+
+     data = requests.get(
+         f"{BASE_URL}/competitions/{competition_id}/players",
+         # f"{BASE_URL}/players/{player_id}/career",
+         # f"{BASE_URL}/matches/{match_id}/advancedstats/players",
+         # f"{BASE_URL}/competitions/{competition_id}/matches",  # teams/{kilmarnock_id}/advancedstats?compId={competition_id}",
+         # f"{BASE_URL}/teams/{kilmarnock_id}/squad",  # teams/{kilmarnock_id}/advancedstats?compId={competition_id}",
+         auth=(os.environ["WYSCOUT_USERNAME"], os.environ["WYSCOUT_PASSWORD"]),
+     ).json()
+     from pprint import pprint
+
+     pprint(data)
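
As a small illustration (not part of the file above), the wyscout_pager_fn helper only returns a next-page URL while pages remain; with a fabricated response payload:

from ingestify.infra.source.wyscout import wyscout_pager_fn

# Fabricated response; the "meta" field names match those read by wyscout_pager_fn.
response = {"meta": {"page_current": 1, "page_count": 3}}
url = "https://apirest.wyscout.com/v3/seasons/188105/players?limit=100"

print(wyscout_pager_fn(url, response))  # ...players?limit=100&page=2
print(wyscout_pager_fn(url, {"meta": {"page_current": 3, "page_count": 3}}))  # None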

ingestify/infra/store/__init__.py
@@ -0,0 +1,2 @@
+ from .dataset import *
+ from .file import *

ingestify/infra/store/dataset/__init__.py
@@ -0,0 +1,2 @@
+ from .local_dataset_repository import LocalDatasetRepository
+ from .sqlalchemy import SqlAlchemyDatasetRepository

ingestify/infra/store/dataset/local_dataset_repository.py
@@ -0,0 +1,73 @@
+ import glob
+ import os
+ import pickle
+ import uuid
+ from pathlib import Path
+ from typing import Optional
+
+ from ingestify.domain.models import (
+     Dataset,
+     DatasetCollection,
+     DatasetRepository,
+     Selector,
+ )
+
+
+ def parse_value(v):
+     try:
+         return int(v)
+     except ValueError:
+         return v
+
+
+ class LocalDatasetRepository(DatasetRepository):
+     def destroy(self, dataset: Dataset):
+         path = (
+             self.base_dir / dataset.identifier.key.replace("/", "__") / "dataset.pickle"
+         )
+         path.unlink()
+
+     @classmethod
+     def supports(cls, url: str) -> bool:
+         return url.startswith("file://")
+
+     def __init__(self, url: str):
+         self.base_dir = Path(url[7:])
+         raise DeprecationWarning(
+             "This Repository should not be used. Better use SqlAlchemyDatasetRepository with a local sqlite database."
+         )
+
+     def get_dataset_collection(
+         self,
+         dataset_type: Optional[str] = None,
+         provider: Optional[str] = None,
+         dataset_id: Optional[str] = None,
+         selector: Optional[Selector] = None,
+         **kwargs
+     ) -> DatasetCollection:
+
+         datasets = []
+         for dir_name in glob.glob(str(self.base_dir / "*")):
+             attributes = {
+                 item[0]: parse_value(item[1])
+                 for item in [
+                     part.split("=") for part in os.path.basename(dir_name).split("__")
+                 ]
+             }
+             if not selector or selector.matches(attributes):
+                 with open(dir_name + "/dataset.pickle", "rb") as fp:
+                     dataset = pickle.load(fp)
+                 datasets.append(dataset)
+         return DatasetCollection(datasets)
+
+     def save(self, bucket: str, dataset: Dataset):
+         path = (
+             self.base_dir / dataset.identifier.key.replace("/", "__") / "dataset.pickle"
+         )
+         path.parent.mkdir(parents=True, exist_ok=True)
+
+         with open(path, "wb") as fp:
+             pickle.dump(dataset, fp)
+
+     def next_identity(self):
+         return str(uuid.uuid4())
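
A short sketch (not part of the repository code) of the directory-name convention get_dataset_collection relies on: identifier attributes are stored as name=value pairs joined with "__", and parse_value coerces integer-looking values back to int. The example directory name is made up.

import os

def parse_value(v):
    # Same coercion as in the file above: int where possible, str otherwise.
    try:
        return int(v)
    except ValueError:
        return v

dir_name = "/data/datasets/competition_id=11__season_id=42"  # hypothetical path
attributes = {
    item[0]: parse_value(item[1])
    for item in [part.split("=") for part in os.path.basename(dir_name).split("__")]
}
print(attributes)  # {'competition_id': 11, 'season_id': 42}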

ingestify/infra/store/dataset/sqlalchemy/__init__.py
@@ -0,0 +1 @@
+ from .repository import SqlAlchemyDatasetRepository

ingestify/infra/store/dataset/sqlalchemy/mapping.py
@@ -0,0 +1,153 @@
+ import datetime
+ from pathlib import Path
+
+ from sqlalchemy import (
+     JSON,
+     BigInteger,
+     Column,
+     DateTime,
+     ForeignKey,
+     ForeignKeyConstraint,
+     Integer,
+     MetaData,
+     String,
+     Table,
+     TypeDecorator,
+ )
+ from sqlalchemy.orm import registry, relationship
+
+ from ingestify.domain.models import Dataset, File, Revision
+ from ingestify.domain.models.dataset.dataset import DatasetState
+
+
+ class TZDateTime(TypeDecorator):
+     impl = DateTime
+     LOCAL_TIMEZONE = datetime.datetime.utcnow().astimezone().tzinfo
+     cache_ok = True
+
+     def process_bind_param(self, value: datetime, dialect):
+         if value.tzinfo is None:
+             value = value.astimezone(self.LOCAL_TIMEZONE)
+
+         return value.astimezone(datetime.timezone.utc)
+
+     def process_result_value(self, value, dialect):
+         if not value:
+             return value
+
+         if value.tzinfo is None:
+             return value.replace(tzinfo=datetime.timezone.utc)
+
+         return value.astimezone(datetime.timezone.utc)
+
+
+ class PathString(TypeDecorator):
+     impl = String(255)
+
+     def process_bind_param(self, value: Path, dialect):
+         return str(value)
+
+     def process_result_value(self, value, dialect):
+         if not value:
+             return value
+
+         return Path(value)
+
+
+ class DatasetStateString(TypeDecorator):
+     impl = String(255)
+
+     def process_bind_param(self, value: DatasetState, dialect):
+         return value.value
+
+     def process_result_value(self, value, dialect):
+         if not value:
+             return value
+
+         return DatasetState[value]
+
+
+ mapper_registry = registry()
+
+ metadata = MetaData()
+
+ dataset_table = Table(
+     "dataset",
+     metadata,
+     Column("bucket", String(255), default=None),
+     Column("dataset_id", String(255), primary_key=True),
+     Column("provider", String(255)),
+     Column("dataset_type", String(255)),
+     Column("state", DatasetStateString),
+     Column("name", String(255)),
+     Column("identifier", JSON),
+     Column("metadata", JSON),
+     Column("created_at", TZDateTime(6)),
+     Column("updated_at", TZDateTime(6)),
+ )
+
+ revision_table = Table(
+     "revision",
+     metadata,
+     Column(
+         "dataset_id", String(255), ForeignKey("dataset.dataset_id"), primary_key=True
+     ),
+     Column("revision_id", Integer, primary_key=True),
+     Column("description", String(255)),
+     Column("created_at", TZDateTime(6)),
+ )
+ file_table = Table(
+     "file",
+     metadata,
+     Column("dataset_id", String(255), primary_key=True),
+     Column("revision_id", Integer, primary_key=True),
+     Column("file_id", String(255), primary_key=True),
+     Column("created_at", TZDateTime(6)),
+     Column("modified_at", TZDateTime(6)),
+     Column("tag", String(255)),
+     Column("content_type", String(255)),
+     Column("size", BigInteger),
+     Column("data_feed_key", String(255)),
+     Column("data_spec_version", String(255)),
+     Column("data_serialization_format", String(255)),
+     Column("storage_compression_method", String(255)),
+     Column("storage_size", BigInteger),
+     Column("storage_path", PathString),
+     ForeignKeyConstraint(
+         ("dataset_id", "revision_id"),
+         [revision_table.c.dataset_id, revision_table.c.revision_id],
+         ondelete="CASCADE",
+     ),
+ )
+
+
+ mapper_registry.map_imperatively(
+     Dataset,
+     dataset_table,
+     properties={
+         "revisions": relationship(
+             Revision,
+             backref="dataset",
+             order_by=revision_table.c.revision_id,
+             lazy="joined",
+             cascade="all, delete-orphan",
+         ),
+     },
+ )
+
+ mapper_registry.map_imperatively(
+     Revision,
+     revision_table,
+     properties={
+         "modified_files": relationship(
+             File,
+             order_by=file_table.c.file_id,
+             primaryjoin="and_(Revision.revision_id==File.revision_id, Revision.dataset_id==File.dataset_id)",
+             lazy="joined",
+             cascade="all, delete-orphan",
+         )
+     },
+ )
+
+
+ mapper_registry.map_imperatively(File, file_table)
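
A minimal sketch (not part of the package) of how this imperative mapping might be exercised against an in-memory SQLite database; it only assumes that metadata and the mapped domain classes are importable as shown in the file above.

from sqlalchemy import create_engine
from sqlalchemy.orm import Session

# Importing the mapping module registers the imperative mappers as a side effect.
from ingestify.infra.store.dataset.sqlalchemy.mapping import metadata
from ingestify.domain.models import Dataset

engine = create_engine("sqlite:///:memory:")
metadata.create_all(engine)  # emits CREATE TABLE for dataset, revision and file

with Session(engine) as session:
    # Constructing Dataset instances needs the domain model's own arguments,
    # so this only checks that the mapped class can be queried.
    print(session.query(Dataset).count())  # 0 rows in a fresh database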