ingestify 0.7.0__py3-none-any.whl → 0.9.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ingestify/__init__.py +2 -1
- ingestify/application/ingestion_engine.py +3 -0
- ingestify/application/loader.py +12 -2
- ingestify/domain/models/dataset/dataset_state.py +1 -0
- ingestify/domain/models/dataset/file.py +6 -0
- ingestify/domain/models/ingestion/ingestion_job.py +5 -1
- ingestify/domain/models/resources/dataset_resource.py +13 -1
- ingestify/infra/fetch/http.py +3 -3
- ingestify/infra/store/dataset/sqlalchemy/repository.py +90 -50
- ingestify/infra/store/dataset/sqlalchemy/tables.py +191 -174
- ingestify/main.py +189 -5
- ingestify/tests/__init__.py +0 -0
- ingestify/tests/conftest.py +17 -0
- ingestify/tests/test_auto_ingest.py +418 -0
- ingestify/tests/test_engine.py +501 -0
- ingestify/tests/test_events.py +201 -0
- ingestify/tests/test_file_cache.py +98 -0
- ingestify/tests/test_pagination.py +162 -0
- ingestify/tests/test_store_version.py +73 -0
- ingestify/tests/test_table_prefix.py +78 -0
- {ingestify-0.7.0.dist-info → ingestify-0.9.0.dist-info}/METADATA +59 -5
- {ingestify-0.7.0.dist-info → ingestify-0.9.0.dist-info}/RECORD +25 -16
- {ingestify-0.7.0.dist-info → ingestify-0.9.0.dist-info}/WHEEL +1 -1
- {ingestify-0.7.0.dist-info → ingestify-0.9.0.dist-info}/entry_points.txt +0 -0
- {ingestify-0.7.0.dist-info → ingestify-0.9.0.dist-info}/top_level.txt +0 -0
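
Each of the new test modules below receives a `config_file` pytest fixture defined in the new `ingestify/tests/conftest.py` (+17 lines), whose contents are not shown in this diff. A minimal sketch of what such a fixture could look like, assuming a SQLite metadata store and local file storage under a temp directory (hypothetical, not the actual conftest.py):

```python
# Hypothetical sketch of the config_file fixture; the real conftest.py is not shown in this diff.
import pytest
import yaml


@pytest.fixture
def config_file(tmp_path):
    # Assumed shape, mirroring the config dict used in test_table_prefix.py:
    # one "main" engine with SQLite metadata and file storage under tmp_path.
    config = {
        "main": {
            "metadata_url": f"sqlite:///{tmp_path / 'catalog.db'}",
            "file_url": f"file://{tmp_path / 'files'}",
            "default_bucket": "main",
        }
    }
    path = tmp_path / "config.yaml"
    path.write_text(yaml.dump(config))
    return str(path)
```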
ingestify/tests/test_file_cache.py (new file)
@@ -0,0 +1,98 @@
+import pytest
+from io import BytesIO
+from unittest.mock import patch
+from datetime import datetime, timezone
+
+from ingestify.main import get_engine
+from ingestify.domain import Dataset, Identifier, Revision, File
+from ingestify.domain.models.dataset.revision import RevisionSource, SourceType
+
+
+def test_file_cache(config_file):
+    """Test file caching with the with_file_cache context manager."""
+    # Get engine from the fixture
+    engine = get_engine(config_file, "main")
+    store = engine.store
+
+    # Create a timestamp for test data
+    now = datetime.now(timezone.utc)
+
+    # Create a test file
+    test_file = File(
+        file_id="test_file_id",
+        data_feed_key="test_file",
+        tag="test_tag",
+        data_serialization_format="txt",
+        storage_path="test/path",
+        storage_size=100,
+        storage_compression_method="none",
+        created_at=now,
+        modified_at=now,
+        size=100,
+        content_type="text/plain",
+        data_spec_version="v1",
+    )
+
+    # Create a test revision with the file
+    revision = Revision(
+        revision_id=1,
+        created_at=now,
+        description="Test revision",
+        modified_files=[test_file],
+        source={"source_type": SourceType.MANUAL, "source_id": "test"},
+    )
+
+    # Create a test dataset with the revision
+    dataset = Dataset(
+        bucket="test-bucket",
+        dataset_id="test-dataset",
+        name="Test Dataset",
+        state="COMPLETE",
+        identifier=Identifier(test_id=1),
+        dataset_type="test",
+        provider="test-provider",
+        metadata={},
+        created_at=now,
+        updated_at=now,
+        last_modified_at=now,
+        revisions=[revision],
+    )
+
+    # Create a simple pass-through reader function to replace the gzip reader
+    def simple_reader(stream):
+        return stream
+
+    # Mock both the file repository and the _prepare_read_stream method
+    with patch.object(
+        store.file_repository, "load_content"
+    ) as mock_load_content, patch.object(
+        store, "_prepare_read_stream"
+    ) as mock_prepare_read_stream:
+
+        # Set up the mocks
+        mock_load_content.return_value = BytesIO(b"test content")
+        mock_prepare_read_stream.return_value = (simple_reader, "")
+
+        # Test without caching - should load files twice
+        store.load_files(dataset)
+        store.load_files(dataset)
+
+        # Should have called load_content twice (without caching)
+        assert mock_load_content.call_count == 2
+
+        # Reset the mock
+        mock_load_content.reset_mock()
+
+        # Test with caching - should load files only once
+        with store.with_file_cache():
+            store.load_files(dataset)
+            store.load_files(dataset)
+
+        # Should have called load_content only once (with caching)
+        assert mock_load_content.call_count == 1
+
+        # After exiting context, caching should be disabled
+        store.load_files(dataset)
+
+        # Should have called load_content again
+        assert mock_load_content.call_count == 2
ingestify/tests/test_pagination.py (new file)
@@ -0,0 +1,162 @@
+import pytest
+from datetime import datetime, timedelta
+import pytz
+
+from ingestify.domain import Dataset, Identifier, DatasetState
+from ingestify.main import get_engine
+
+
+def test_iter_dataset_collection_batches(config_file):
+    """Test iteration over datasets with batches using iter_dataset_collection_batches."""
+    # Get engine from the fixture
+    engine = get_engine(config_file, "main")
+    store = engine.store
+    bucket = store.bucket
+
+    # Create 30 datasets with different creation times
+    now = datetime.now(pytz.utc)
+
+    # Save datasets with ascending created_at timestamps
+    for i in range(30):
+        dataset = Dataset(
+            bucket=bucket,
+            dataset_id=f"dataset-{i}",
+            name=f"Dataset {i}",
+            state="COMPLETE",
+            identifier=Identifier(test_id=i),
+            dataset_type="test",
+            provider="test-provider",
+            metadata={},
+            created_at=now
+            + timedelta(minutes=i),  # Each dataset created 1 minute apart
+            updated_at=now + timedelta(minutes=i),
+            last_modified_at=now + timedelta(minutes=i),
+        )
+        store.dataset_repository.save(bucket, dataset)
+
+    # Test iteration with small batch_size (yields individual datasets)
+    dataset_ids = []
+    for dataset in store.iter_dataset_collection_batches(
+        dataset_type="test",
+        provider="test-provider",
+        batch_size=5,  # Small batch size to force multiple batches
+    ):
+        dataset_ids.append(dataset.dataset_id)
+
+    # Should get all 30 datasets
+    assert len(dataset_ids) == 30
+
+    # Make sure we have all datasets from 0 to 29
+    expected_ids = [f"dataset-{i}" for i in range(30)]
+    assert set(dataset_ids) == set(expected_ids)
+
+    # Test iteration yielding entire DatasetCollection objects
+    collections = []
+    for collection in store.iter_dataset_collection_batches(
+        dataset_type="test",
+        provider="test-provider",
+        batch_size=5,  # Small batch size to force multiple batches
+        yield_dataset_collection=True,
+    ):
+        collections.append(collection)
+
+    # Should have 6 collections (30 datasets / 5 per batch = 6 batches)
+    assert len(collections) == 6
+
+    # Verify total dataset count across all collections
+    total_datasets = sum(len(collection) for collection in collections)
+    assert total_datasets == 30
+
+    # Test iteration with a filter that returns fewer results
+    filtered_dataset_ids = []
+    for dataset in store.iter_dataset_collection_batches(
+        dataset_type="test",
+        provider="test-provider",
+        test_id=5,  # Only get dataset with test_id=5
+        batch_size=10,
+    ):
+        filtered_dataset_ids.append(dataset.dataset_id)
+
+    assert len(filtered_dataset_ids) == 1
+    assert filtered_dataset_ids[0] == "dataset-5"
+
+
+def test_dataset_state_filter(config_file):
+    """Test filtering datasets by state."""
+    # Get engine from the fixture
+    engine = get_engine(config_file, "main")
+    store = engine.store
+    bucket = store.bucket
+
+    now = datetime.now(pytz.utc)
+
+    # Create datasets with different states
+    states = [
+        DatasetState.COMPLETE,
+        DatasetState.PARTIAL,
+        DatasetState.SCHEDULED,
+        DatasetState.MISSING,
+    ]
+    for i in range(12):  # 3 datasets per state
+        state = states[i % 4]
+        dataset = Dataset(
+            bucket=bucket,
+            dataset_id=f"state-test-{i}",
+            name=f"State Test {i}",
+            state=state,
+            identifier=Identifier(test_id=i),
+            dataset_type="state-test",
+            provider="test-provider",
+            metadata={},
+            created_at=now + timedelta(minutes=i),
+            updated_at=now + timedelta(minutes=i),
+            last_modified_at=now + timedelta(minutes=i),
+        )
+        store.dataset_repository.save(bucket, dataset)
+
+    # Test filtering by a single state using enum
+    complete_datasets = store.get_dataset_collection(
+        dataset_type="state-test", dataset_state=DatasetState.COMPLETE
+    )
+    assert len(complete_datasets) == 3
+
+    # Test filtering by a single state using string
+    partial_datasets = store.get_dataset_collection(
+        dataset_type="state-test", dataset_state="PARTIAL"
+    )
+    assert len(partial_datasets) == 3
+
+    # Test filtering by multiple states using a list of enums
+    mixed_datasets = store.get_dataset_collection(
+        dataset_type="state-test",
+        dataset_state=[
+            DatasetState.COMPLETE,
+            DatasetState.SCHEDULED,
+            DatasetState.MISSING,
+        ],
+    )
+    assert len(mixed_datasets) == 9
+
+    # Test filtering by multiple states using a list of strings
+    mixed_datasets_strings = store.get_dataset_collection(
+        dataset_type="state-test", dataset_state=["COMPLETE", "SCHEDULED"]
+    )
+    assert len(mixed_datasets_strings) == 6
+
+    # Test case-insensitivity
+    lowercase_state_datasets = store.get_dataset_collection(
+        dataset_type="state-test", dataset_state="complete"
+    )
+    assert len(lowercase_state_datasets) == 3
+
+    # Test with iter_dataset_collection
+    scheduled_dataset_ids = []
+    for dataset in store.iter_dataset_collection_batches(
+        dataset_type="state-test",
+        dataset_state=DatasetState.SCHEDULED,
+        batch_size=2,  # Small batch size to test pagination with filters
+    ):
+        scheduled_dataset_ids.append(dataset.dataset_id)
+        assert dataset.state == DatasetState.SCHEDULED
+
+    assert len(scheduled_dataset_ids) == 3
ingestify/tests/test_store_version.py (new file)
@@ -0,0 +1,73 @@
+import pytest
+from unittest.mock import patch
+
+from ingestify.main import get_engine
+
+
+def test_store_version_tracking_new_store(config_file):
+    """Test that a new store gets initialized with the current version."""
+    with patch("ingestify.__version__", "1.0.0"):
+        engine = get_engine(config_file)
+
+        # Check that version was stored
+        stored_version = engine.store.dataset_repository.get_store_version()
+        assert stored_version == "1.0.0"
+
+
+def test_store_version_tracking_existing_store_same_version(config_file):
+    """Test that an existing store with same version doesn't cause issues."""
+    with patch("ingestify.__version__", "1.0.0"):
+        # Initialize store first time
+        engine1 = get_engine(config_file)
+        store1 = engine1.store
+
+        # Open store again with same version
+        engine2 = get_engine(config_file)
+        store2 = engine2.store
+
+        # Version should still be stored correctly
+        stored_version = store2.dataset_repository.get_store_version()
+        assert stored_version == "1.0.0"
+
+
+def test_store_version_tracking_version_mismatch(config_file, caplog):
+    """Test that version mismatch is logged as warning."""
+    # Initialize store with version 1.0.0
+    with patch("ingestify.__version__", "1.0.0"):
+        engine1 = get_engine(config_file)
+        store1 = engine1.store
+
+        stored_version = store1.dataset_repository.get_store_version()
+        assert stored_version == "1.0.0"
+
+    # Open store with different version
+    with patch("ingestify.__version__", "2.0.0"):
+        engine2 = get_engine(config_file)
+        store2 = engine2.store
+
+        # Version should still be the original one
+        stored_version = store2.dataset_repository.get_store_version()
+        assert stored_version == "1.0.0"
+
+        # Should have logged a warning about version mismatch
+        assert "Store version mismatch" in caplog.text
+        assert "stored=1.0.0, current=2.0.0" in caplog.text
+
+
+def test_store_version_methods(config_file):
+    """Test the repository version methods directly."""
+    engine = get_engine(config_file)
+    repo = engine.store.dataset_repository
+
+    from ingestify import __version__
+
+    # Initially the real version is stored
+    assert repo.get_store_version() == __version__
+
+    # Set a version
+    repo.set_store_version("1.2.3")
+    assert repo.get_store_version() == "1.2.3"
+
+    # Update version
+    repo.set_store_version("1.2.4")
+    assert repo.get_store_version() == "1.2.4"
ingestify/tests/test_table_prefix.py (new file)
@@ -0,0 +1,78 @@
+#!/usr/bin/env python3
+"""End-to-end test for table_prefix configuration"""
+import tempfile
+import yaml
+from pathlib import Path
+from sqlalchemy import inspect
+
+from ingestify.main import get_datastore
+
+
+def test_table_prefix_from_config():
+    """Test that metadata_options.table_prefix is correctly applied from config"""
+    temp_dir = Path(tempfile.mkdtemp())
+
+    # Test 1: Config without metadata_options (default behavior)
+    config_no_prefix = {
+        "main": {
+            "metadata_url": f"sqlite:///{temp_dir / 'no_prefix.db'}",
+            "file_url": f"file://{temp_dir / 'files'}",
+            "default_bucket": "main",
+        }
+    }
+    config_path_no_prefix = temp_dir / "config_no_prefix.yaml"
+    config_path_no_prefix.write_text(yaml.dump(config_no_prefix))
+
+    store_no_prefix = get_datastore(str(config_path_no_prefix))
+    inspector = inspect(store_no_prefix.dataset_repository.session_provider.engine)
+    tables = inspector.get_table_names()
+
+    assert "dataset" in tables
+    assert "revision" in tables
+    assert "file" in tables
+    assert store_no_prefix.dataset_repository.dataset_table.name == "dataset"
+
+    # Test 2: Config with metadata_options.table_prefix
+    config_with_prefix = {
+        "main": {
+            "metadata_url": f"sqlite:///{temp_dir / 'with_prefix.db'}",
+            "file_url": f"file://{temp_dir / 'files'}",
+            "default_bucket": "main",
+            "metadata_options": {"table_prefix": "prod_"},
+        }
+    }
+    config_path_with_prefix = temp_dir / "config_with_prefix.yaml"
+    config_path_with_prefix.write_text(yaml.dump(config_with_prefix))
+
+    store_with_prefix = get_datastore(str(config_path_with_prefix))
+    inspector_prefixed = inspect(
+        store_with_prefix.dataset_repository.session_provider.engine
+    )
+    tables_prefixed = inspector_prefixed.get_table_names()
+
+    assert "prod_dataset" in tables_prefixed
+    assert "prod_revision" in tables_prefixed
+    assert "prod_file" in tables_prefixed
+    assert "prod_ingestion_job_summary" in tables_prefixed
+    assert "prod_task_summary" in tables_prefixed
+    assert "prod_store_version" in tables_prefixed
+    assert store_with_prefix.dataset_repository.dataset_table.name == "prod_dataset"
+
+    # Verify foreign keys reference prefixed tables
+    revision_fks = inspector_prefixed.get_foreign_keys("prod_revision")
+    assert revision_fks[0]["referred_table"] == "prod_dataset"
+
+    file_fks = inspector_prefixed.get_foreign_keys("prod_file")
+    assert file_fks[0]["referred_table"] == "prod_revision"
+
+    task_fks = inspector_prefixed.get_foreign_keys("prod_task_summary")
+    assert task_fks[0]["referred_table"] == "prod_ingestion_job_summary"
+
+    import shutil
+
+    shutil.rmtree(temp_dir)
+
+
+if __name__ == "__main__":
+    test_table_prefix_from_config()
+    print("✓ All tests passed")
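
test_table_prefix.py builds its configuration as a Python dict and writes it out with `yaml.dump`. For reference, the equivalent `config.yaml` with the `metadata_options.table_prefix` setting would look roughly like this (paths are placeholders):

```yaml
main:
  metadata_url: "sqlite:///path/to/with_prefix.db"
  file_url: "file:///path/to/files"
  default_bucket: "main"
  metadata_options:
    table_prefix: "prod_"
```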
{ingestify-0.7.0.dist-info → ingestify-0.9.0.dist-info}/METADATA
@@ -1,6 +1,6 @@
-Metadata-Version: 2.
+Metadata-Version: 2.4
 Name: ingestify
-Version: 0.7.0
+Version: 0.9.0
 Summary: Data Ingestion Framework
 Author: Koen Vossen
 Author-email: info@koenvossen.nl
@@ -10,12 +10,20 @@ Requires-Dist: requests<3,>=2.0.0
 Requires-Dist: SQLAlchemy<3,>=2
 Requires-Dist: click>=8
 Requires-Dist: python-dotenv
-Requires-Dist:
+Requires-Dist: pyaml_env
 Requires-Dist: boto3
 Requires-Dist: pydantic>=2.0.0
 Provides-Extra: test
 Requires-Dist: pytest<7,>=6.2.5; extra == "test"
 Requires-Dist: pytz; extra == "test"
+Dynamic: author
+Dynamic: author-email
+Dynamic: description
+Dynamic: description-content-type
+Dynamic: license
+Dynamic: provides-extra
+Dynamic: requires-dist
+Dynamic: summary
 
 # Ingestify
 
@@ -77,6 +85,43 @@ Ingestify fixes that by building **your own data lake** of untouched provider fi
 pip install ingestify # or: pip install git+https://github.com/PySport/ingestify.git
 ```
 
+### Developing a new Source
+
+When developing a new `Source`, use the `debug_source()` helper for rapid iteration:
+
+```python
+from ingestify import Source, debug_source
+
+class MyCustomSource(Source):
+    provider = "my_provider"
+
+    def __init__(self, name: str, api_key: str):
+        super().__init__(name)
+        self.api_key = api_key
+
+    def find_datasets(self, dataset_type, data_spec_versions, **kwargs):
+        # Your source implementation
+        ...
+
+# Quick debug - runs full ingestion with temp storage
+if __name__ == "__main__":
+    source = MyCustomSource(name="test", api_key="...")
+
+    debug_source(
+        source,
+        dataset_type="match",
+        data_spec_versions={"events": "v1"},
+    )
+```
+
+The `debug_source()` helper:
+- ✅ Creates an ephemeral dev engine with temp storage
+- ✅ Configures logging automatically
+- ✅ Runs the full ingestion cycle
+- ✅ Shows storage location and results
+
+Perfect for testing your source before adding it to production config!
+
 ### Minimal `config.yaml`
 
 ```yaml
@@ -175,8 +220,16 @@ pip install kloppy
 ```
 
 ```python
+import logging, sys
+
 from ingestify.main import get_engine
 
+logging.basicConfig(
+    level=logging.INFO,
+    format="%(asctime)s [%(levelname)s] %(name)s: %(message)s",
+    stream=sys.stderr,
+)
+
 engine = get_engine(
     metadata_url="sqlite:///database_open_data/catalog.db",
     file_url="file://database_open_data/files/"
@@ -188,12 +241,13 @@ dataset_iter = engine.iter_datasets(
 
     provider="statsbomb",
     dataset_type="match",
-    competition_id=43,
-    season_id=281
+    competition_id=43, # "FIFA World Cup"
+    #season_id=281
 )
 
 for dataset in dataset_iter:
     kloppy_dataset = engine.load_dataset_with_kloppy(dataset)
+    logging.info(f"Loaded {kloppy_dataset}")
 ```
 
 
{ingestify-0.7.0.dist-info → ingestify-0.9.0.dist-info}/RECORD
@@ -1,14 +1,14 @@
-ingestify/__init__.py,sha256=
+ingestify/__init__.py,sha256=fVzksB6rrJmEm-2P5DvT2JbShYaTZ2nKdbbZf8KabC4,336
 ingestify/cmdline.py,sha256=Rs1_lSKSIJrcygH5fvtOGicOl_e0sZYW7deqp4_jGbY,6233
 ingestify/exceptions.py,sha256=izRzaLQmMy-4P8ZqGqVZyf4k6LFYOYqwYLuRaUH8BJw,187
-ingestify/main.py,sha256=
+ingestify/main.py,sha256=mMXDNzSl1dzN03BUiS97uP3XwFMdgadxP0hJlONsZ_g,15789
 ingestify/server.py,sha256=OVrf_XtpAQIn88MzqQzShXgsA9_jbnqYvD8YPBjn3cs,2413
 ingestify/source_base.py,sha256=GXAFCoT11Zov9M2v-fqQr9gFCXbtVfEIEH32V7r2oE8,382
 ingestify/utils.py,sha256=tsoo-GgeSrwK161WCqW793BAm5bjvnGwI8yGgLTJ1lk,6486
 ingestify/application/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 ingestify/application/dataset_store.py,sha256=GP6wGjVirefEn6hlqWIkOBqdELad9L_mmTpdHdzj18M,20353
-ingestify/application/ingestion_engine.py,sha256=
-ingestify/application/loader.py,sha256=
+ingestify/application/ingestion_engine.py,sha256=we16yiDS9QGOlAUiP1vidDycihjWK3B2jo64uqKmrXE,11246
+ingestify/application/loader.py,sha256=K99ZJuHMEJFO6CIlxoyHKGSQtXw63JgOYu3moUD6sR0,13400
 ingestify/application/secrets_manager.py,sha256=5qCbPfUvRGP1Xbq6xPkMfpgYl8uPzF_0NbiKeRY5zxU,1757
 ingestify/domain/__init__.py,sha256=M7_fVTJjQUx53P4UQUPhowRKPKsIIjx4JYZL1yjHKsM,46
 ingestify/domain/models/__init__.py,sha256=WuKS34uiR1EwyczKujBHYGupqseJP-U2P5IQS4kpsA8,838
@@ -23,9 +23,9 @@ ingestify/domain/models/dataset/collection.py,sha256=YKGQv6hqm88MYlNp2c47CoWysyN
 ingestify/domain/models/dataset/collection_metadata.py,sha256=aWY6O3_JLj_jKfVfUTjmi3-E4heBmmmtqX81vhdzr0I,498
 ingestify/domain/models/dataset/dataset.py,sha256=OiP03nY0-m06y2GTrs_m-RiZE8HwypIHRwSqoM_DNnQ,4049
 ingestify/domain/models/dataset/dataset_repository.py,sha256=bf3F_1cKw0CvUberD3FMROE8iowAmYefnD4L6aPB39k,989
-ingestify/domain/models/dataset/dataset_state.py,sha256=
+ingestify/domain/models/dataset/dataset_state.py,sha256=AHJSoCXGVJeBe0eyFJMfvLdAZuf82xjdReCyCWZFlSY,348
 ingestify/domain/models/dataset/events.py,sha256=M8jrHWCm9iXapAy3xjvZZtiiOxXDnfefBixiMwkas24,786
-ingestify/domain/models/dataset/file.py,sha256=
+ingestify/domain/models/dataset/file.py,sha256=cXDjSw19HRMCGFpVN4u1oejxE1V8SMQptfNVDVixj6o,4464
 ingestify/domain/models/dataset/file_collection.py,sha256=yaQmqFlmbajLCkU5QnjgqCvKzvVEZJrXVvinx5UGHcM,1193
 ingestify/domain/models/dataset/file_repository.py,sha256=9EQprch9isAH2pbK7e7tfOKl6ulip4Ij1kBCTbO_rTc,1721
 ingestify/domain/models/dataset/identifier.py,sha256=EJYsxt0OS_43Y989DZQq8U9NjwmtvnHGYGMe6-hOBlI,575
@@ -39,11 +39,11 @@ ingestify/domain/models/event/event_bus.py,sha256=feVXsbBcRNkbWYvXbmz-Yi9-3R690y
 ingestify/domain/models/event/publisher.py,sha256=TOAawYYiPQCLR2Gm17LumMEzeapMDYcAYeklLFmwqAY,620
 ingestify/domain/models/event/subscriber.py,sha256=tP1ZFSvpJWKUITnATYekRxJzepz85UY7egBTMiP-dwg,1039
 ingestify/domain/models/ingestion/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-ingestify/domain/models/ingestion/ingestion_job.py,sha256=
+ingestify/domain/models/ingestion/ingestion_job.py,sha256=Ou8v_FXDNnbrzPjHYiLoXMEg7ZRNFPjK1BMk9DY7L2E,15574
 ingestify/domain/models/ingestion/ingestion_job_summary.py,sha256=7dmkWEvE7lMSv1ILWcSvys1bUGuGe_s-YbOFC6eYMBI,4794
 ingestify/domain/models/ingestion/ingestion_plan.py,sha256=KAvITBMQt3zmMFokESQJyp3rMuz1Hxr6msfZK1_faZM,648
 ingestify/domain/models/resources/__init__.py,sha256=ZuY9DPRfwk-aLB3Lj6DYP_NqMkcQfcYjZp4VejTtcbU,46
-ingestify/domain/models/resources/dataset_resource.py,sha256=
+ingestify/domain/models/resources/dataset_resource.py,sha256=zhTCM4bX6Wf4iWG2g8_SBx2U05YFxTxiZK3f6EqwD6I,3598
 ingestify/domain/models/task/__init__.py,sha256=BdlyIPvE07Xax_IzLgO9DUw0wsz9OZutxnxdDNyRlys,79
 ingestify/domain/models/task/set.py,sha256=04txDYgS5rotXofD9TqChKdW0VZIYshrkfPIpXtlhW4,430
 ingestify/domain/models/task/task.py,sha256=OwLZQi9GGe0O8m1dKvJdN2Rham5oilI49KyKc5uV20A,161
@@ -54,7 +54,7 @@ ingestify/domain/services/transformers/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JC
 ingestify/domain/services/transformers/kloppy_to_pandas.py,sha256=NcN6nTBGVn9gz-_hWZJTMcduS1Gg7EM4X95Cqxi1QIM,809
 ingestify/infra/__init__.py,sha256=V0hpLzPVTcOHRVh0gguF6FT30YIgEOUd5v87xUHkfZ4,88
 ingestify/infra/fetch/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-ingestify/infra/fetch/http.py,sha256=
+ingestify/infra/fetch/http.py,sha256=oaERHk-0Azu3T2-r5gHPuC9qvmD4cPURGY02q3GoI00,4647
 ingestify/infra/serialization/__init__.py,sha256=UqXWJmKTp7Mi58ZyDASGguPFlqdVWVUbm_sg9GWx9eI,702
 ingestify/infra/sink/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 ingestify/infra/sink/postgresql.py,sha256=SxuM3LntfYcpCriUpqJhMvgAf0s9cohXf6WkxSEDYDY,1816
@@ -66,14 +66,23 @@ ingestify/infra/source/statsbomb/match.py,sha256=8Zpdys6-bB_ral2AmjGKhF4BnXW3F0Y
 ingestify/infra/store/__init__.py,sha256=3dA6NWfB6FS5SFdQiSlJ0ZghBfnUAUuGIP5Vr4rkCqk,43
 ingestify/infra/store/dataset/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 ingestify/infra/store/dataset/sqlalchemy/__init__.py,sha256=Z5JHWGO_hwT6rO-ecMOOAmOKjFFJi449KZvJTQgt6vQ,52
-ingestify/infra/store/dataset/sqlalchemy/repository.py,sha256=
-ingestify/infra/store/dataset/sqlalchemy/tables.py,sha256=
+ingestify/infra/store/dataset/sqlalchemy/repository.py,sha256=6WPHKxA6UhgzO3P4Sdbq7W14tTMxisC8Js3MZilPoNc,24160
+ingestify/infra/store/dataset/sqlalchemy/tables.py,sha256=1FewCsN7jdk1ITzL_neOwJWtHD03NxCS9E8dhZcz4oY,12236
 ingestify/infra/store/file/__init__.py,sha256=DuEekZa2pmDuRCFiulbgoGotN0wGv3OrRXSvokY0PhY,104
 ingestify/infra/store/file/dummy_file_repository.py,sha256=azUq9c43Mz9-GWk9j0E97BaqyUKu-ZMrcuaIednLq5E,723
 ingestify/infra/store/file/local_file_repository.py,sha256=1hhLqds5LlppJq2QBB0oN0Q98j6aXreCtYQYz3Q1P8g,819
 ingestify/infra/store/file/s3_file_repository.py,sha256=tz_EZ_gun7W2qJMlI3j_R03iKBZlJSDcG7AUJ1JkdpE,1501
-ingestify
-ingestify
-ingestify
-ingestify
-ingestify
+ingestify/tests/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ingestify/tests/conftest.py,sha256=Cr768nLMWUfIP6FMR7aFhUCY4uQ9Tz_bXOq0udpiDEE,411
+ingestify/tests/test_auto_ingest.py,sha256=coMOzJBTbeDwUSYDLnqmkaCXpG-6WQeKqf_nj9XFiA4,14502
+ingestify/tests/test_engine.py,sha256=x3_o6M3satos029Er84ptbzxRKoBw5KB0am2KJSQ16Q,15738
+ingestify/tests/test_events.py,sha256=A1f8H4HRyn52SWo3wV_MgSeb6IbT_lNi9wWAK8EGsK4,7806
+ingestify/tests/test_file_cache.py,sha256=Xbh_VLLDH-KQXE3MeujDeOjjYYbAnjGR6wsHwMInKco,3049
+ingestify/tests/test_pagination.py,sha256=uAKDMsM6fYSa4NcAlXDllu2y-8lnh0AclhPZ5MWJKn8,5539
+ingestify/tests/test_store_version.py,sha256=4czUG8LtaGxgjW4trw7BzYJA8blQp3-HM8w-7HjqFl0,2508
+ingestify/tests/test_table_prefix.py,sha256=6N42T6hfulqTlsUlrwhNmZ-TK-ZOt4U8Jx9NxKyLS4I,2844
+ingestify-0.9.0.dist-info/METADATA,sha256=1j8178-ZiJrZb3CkWSiKU4rlBfRnmSMdAaVytjrKc9w,8263
+ingestify-0.9.0.dist-info/WHEEL,sha256=SmOxYU7pzNKBqASvQJ7DjX3XGUF92lrGhMb3R6_iiqI,91
+ingestify-0.9.0.dist-info/entry_points.txt,sha256=czYYXeX2ul4zdeB6bKlz3HaUF7zyVVcj9E_sRNDisI0,53
+ingestify-0.9.0.dist-info/top_level.txt,sha256=Lwnjgns4KequS7KiicXhh6mLUvcdfjzLyPI4qf_s4A0,10
+ingestify-0.9.0.dist-info/RECORD,,

File without changes
File without changes