ingestify 0.6.0__tar.gz → 0.6.2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (92)
  1. {ingestify-0.6.0 → ingestify-0.6.2}/PKG-INFO +1 -1
  2. {ingestify-0.6.0 → ingestify-0.6.2}/ingestify/__init__.py +1 -1
  3. {ingestify-0.6.0 → ingestify-0.6.2}/ingestify/application/dataset_store.py +19 -5
  4. {ingestify-0.6.0 → ingestify-0.6.2}/ingestify/domain/models/dataset/collection_metadata.py +3 -1
  5. {ingestify-0.6.0 → ingestify-0.6.2}/ingestify/domain/models/dataset/dataset.py +17 -1
  6. {ingestify-0.6.0 → ingestify-0.6.2}/ingestify/domain/models/dataset/file.py +6 -1
  7. {ingestify-0.6.0 → ingestify-0.6.2}/ingestify/domain/models/dataset/revision.py +11 -2
  8. {ingestify-0.6.0 → ingestify-0.6.2}/ingestify/domain/models/dataset/selector.py +11 -1
  9. {ingestify-0.6.0 → ingestify-0.6.2}/ingestify/domain/models/fetch_policy.py +3 -1
  10. {ingestify-0.6.0 → ingestify-0.6.2}/ingestify/domain/models/ingestion/ingestion_job.py +70 -55
  11. {ingestify-0.6.0 → ingestify-0.6.2}/ingestify/domain/models/ingestion/ingestion_job_summary.py +5 -0
  12. {ingestify-0.6.0 → ingestify-0.6.2}/ingestify/infra/fetch/http.py +14 -6
  13. {ingestify-0.6.0 → ingestify-0.6.2}/ingestify/infra/store/dataset/sqlalchemy/repository.py +14 -9
  14. {ingestify-0.6.0 → ingestify-0.6.2}/ingestify.egg-info/PKG-INFO +1 -1
  15. {ingestify-0.6.0 → ingestify-0.6.2}/README.md +0 -0
  16. {ingestify-0.6.0 → ingestify-0.6.2}/ingestify/application/__init__.py +0 -0
  17. {ingestify-0.6.0 → ingestify-0.6.2}/ingestify/application/ingestion_engine.py +0 -0
  18. {ingestify-0.6.0 → ingestify-0.6.2}/ingestify/application/loader.py +0 -0
  19. {ingestify-0.6.0 → ingestify-0.6.2}/ingestify/application/secrets_manager.py +0 -0
  20. {ingestify-0.6.0 → ingestify-0.6.2}/ingestify/cmdline.py +0 -0
  21. {ingestify-0.6.0 → ingestify-0.6.2}/ingestify/domain/__init__.py +0 -0
  22. {ingestify-0.6.0 → ingestify-0.6.2}/ingestify/domain/models/__init__.py +0 -0
  23. {ingestify-0.6.0 → ingestify-0.6.2}/ingestify/domain/models/base.py +0 -0
  24. {ingestify-0.6.0 → ingestify-0.6.2}/ingestify/domain/models/data_spec_version_collection.py +0 -0
  25. {ingestify-0.6.0 → ingestify-0.6.2}/ingestify/domain/models/dataset/__init__.py +0 -0
  26. {ingestify-0.6.0 → ingestify-0.6.2}/ingestify/domain/models/dataset/collection.py +0 -0
  27. {ingestify-0.6.0 → ingestify-0.6.2}/ingestify/domain/models/dataset/dataset_repository.py +0 -0
  28. {ingestify-0.6.0 → ingestify-0.6.2}/ingestify/domain/models/dataset/dataset_state.py +0 -0
  29. {ingestify-0.6.0 → ingestify-0.6.2}/ingestify/domain/models/dataset/events.py +0 -0
  30. {ingestify-0.6.0 → ingestify-0.6.2}/ingestify/domain/models/dataset/file_collection.py +0 -0
  31. {ingestify-0.6.0 → ingestify-0.6.2}/ingestify/domain/models/dataset/file_repository.py +0 -0
  32. {ingestify-0.6.0 → ingestify-0.6.2}/ingestify/domain/models/dataset/identifier.py +0 -0
  33. {ingestify-0.6.0 → ingestify-0.6.2}/ingestify/domain/models/event/__init__.py +0 -0
  34. {ingestify-0.6.0 → ingestify-0.6.2}/ingestify/domain/models/event/_old_event.py +0 -0
  35. {ingestify-0.6.0 → ingestify-0.6.2}/ingestify/domain/models/event/dispatcher.py +0 -0
  36. {ingestify-0.6.0 → ingestify-0.6.2}/ingestify/domain/models/event/domain_event.py +0 -0
  37. {ingestify-0.6.0 → ingestify-0.6.2}/ingestify/domain/models/event/event_bus.py +0 -0
  38. {ingestify-0.6.0 → ingestify-0.6.2}/ingestify/domain/models/event/publisher.py +0 -0
  39. {ingestify-0.6.0 → ingestify-0.6.2}/ingestify/domain/models/event/subscriber.py +0 -0
  40. {ingestify-0.6.0 → ingestify-0.6.2}/ingestify/domain/models/ingestion/__init__.py +0 -0
  41. {ingestify-0.6.0 → ingestify-0.6.2}/ingestify/domain/models/ingestion/ingestion_plan.py +0 -0
  42. {ingestify-0.6.0 → ingestify-0.6.2}/ingestify/domain/models/resources/__init__.py +0 -0
  43. {ingestify-0.6.0 → ingestify-0.6.2}/ingestify/domain/models/resources/dataset_resource.py +0 -0
  44. {ingestify-0.6.0 → ingestify-0.6.2}/ingestify/domain/models/sink.py +0 -0
  45. {ingestify-0.6.0 → ingestify-0.6.2}/ingestify/domain/models/source.py +0 -0
  46. {ingestify-0.6.0 → ingestify-0.6.2}/ingestify/domain/models/task/__init__.py +0 -0
  47. {ingestify-0.6.0 → ingestify-0.6.2}/ingestify/domain/models/task/set.py +0 -0
  48. {ingestify-0.6.0 → ingestify-0.6.2}/ingestify/domain/models/task/task.py +0 -0
  49. {ingestify-0.6.0 → ingestify-0.6.2}/ingestify/domain/models/task/task_summary.py +0 -0
  50. {ingestify-0.6.0 → ingestify-0.6.2}/ingestify/domain/models/timing.py +0 -0
  51. {ingestify-0.6.0 → ingestify-0.6.2}/ingestify/domain/services/__init__.py +0 -0
  52. {ingestify-0.6.0 → ingestify-0.6.2}/ingestify/domain/services/identifier_key_transformer.py +0 -0
  53. {ingestify-0.6.0 → ingestify-0.6.2}/ingestify/domain/services/transformers/__init__.py +0 -0
  54. {ingestify-0.6.0 → ingestify-0.6.2}/ingestify/domain/services/transformers/kloppy_to_pandas.py +0 -0
  55. {ingestify-0.6.0 → ingestify-0.6.2}/ingestify/exceptions.py +0 -0
  56. {ingestify-0.6.0 → ingestify-0.6.2}/ingestify/infra/__init__.py +0 -0
  57. {ingestify-0.6.0 → ingestify-0.6.2}/ingestify/infra/fetch/__init__.py +0 -0
  58. {ingestify-0.6.0 → ingestify-0.6.2}/ingestify/infra/serialization/__init__.py +0 -0
  59. {ingestify-0.6.0 → ingestify-0.6.2}/ingestify/infra/sink/__init__.py +0 -0
  60. {ingestify-0.6.0 → ingestify-0.6.2}/ingestify/infra/sink/postgresql.py +0 -0
  61. {ingestify-0.6.0 → ingestify-0.6.2}/ingestify/infra/source/__init__.py +0 -0
  62. {ingestify-0.6.0 → ingestify-0.6.2}/ingestify/infra/source/statsbomb_github.py +0 -0
  63. {ingestify-0.6.0 → ingestify-0.6.2}/ingestify/infra/source/wyscout.py +0 -0
  64. {ingestify-0.6.0 → ingestify-0.6.2}/ingestify/infra/store/__init__.py +0 -0
  65. {ingestify-0.6.0 → ingestify-0.6.2}/ingestify/infra/store/dataset/__init__.py +0 -0
  66. {ingestify-0.6.0 → ingestify-0.6.2}/ingestify/infra/store/dataset/sqlalchemy/__init__.py +0 -0
  67. {ingestify-0.6.0 → ingestify-0.6.2}/ingestify/infra/store/dataset/sqlalchemy/tables.py +0 -0
  68. {ingestify-0.6.0 → ingestify-0.6.2}/ingestify/infra/store/file/__init__.py +0 -0
  69. {ingestify-0.6.0 → ingestify-0.6.2}/ingestify/infra/store/file/dummy_file_repository.py +0 -0
  70. {ingestify-0.6.0 → ingestify-0.6.2}/ingestify/infra/store/file/local_file_repository.py +0 -0
  71. {ingestify-0.6.0 → ingestify-0.6.2}/ingestify/infra/store/file/s3_file_repository.py +0 -0
  72. {ingestify-0.6.0 → ingestify-0.6.2}/ingestify/main.py +0 -0
  73. {ingestify-0.6.0 → ingestify-0.6.2}/ingestify/server.py +0 -0
  74. {ingestify-0.6.0 → ingestify-0.6.2}/ingestify/source_base.py +0 -0
  75. {ingestify-0.6.0 → ingestify-0.6.2}/ingestify/static/templates/statsbomb_github/README.md +0 -0
  76. {ingestify-0.6.0 → ingestify-0.6.2}/ingestify/static/templates/statsbomb_github/config.yaml.jinja2 +0 -0
  77. {ingestify-0.6.0 → ingestify-0.6.2}/ingestify/static/templates/statsbomb_github/database/README.md +0 -0
  78. {ingestify-0.6.0 → ingestify-0.6.2}/ingestify/static/templates/statsbomb_github/query.py +0 -0
  79. {ingestify-0.6.0 → ingestify-0.6.2}/ingestify/static/templates/wyscout/.env +0 -0
  80. {ingestify-0.6.0 → ingestify-0.6.2}/ingestify/static/templates/wyscout/.gitignore +0 -0
  81. {ingestify-0.6.0 → ingestify-0.6.2}/ingestify/static/templates/wyscout/README.md +0 -0
  82. {ingestify-0.6.0 → ingestify-0.6.2}/ingestify/static/templates/wyscout/config.yaml.jinja2 +0 -0
  83. {ingestify-0.6.0 → ingestify-0.6.2}/ingestify/static/templates/wyscout/database/README.md +0 -0
  84. {ingestify-0.6.0 → ingestify-0.6.2}/ingestify/static/templates/wyscout/query.py +0 -0
  85. {ingestify-0.6.0 → ingestify-0.6.2}/ingestify/utils.py +0 -0
  86. {ingestify-0.6.0 → ingestify-0.6.2}/ingestify.egg-info/SOURCES.txt +0 -0
  87. {ingestify-0.6.0 → ingestify-0.6.2}/ingestify.egg-info/dependency_links.txt +0 -0
  88. {ingestify-0.6.0 → ingestify-0.6.2}/ingestify.egg-info/entry_points.txt +0 -0
  89. {ingestify-0.6.0 → ingestify-0.6.2}/ingestify.egg-info/requires.txt +0 -0
  90. {ingestify-0.6.0 → ingestify-0.6.2}/ingestify.egg-info/top_level.txt +0 -0
  91. {ingestify-0.6.0 → ingestify-0.6.2}/setup.cfg +0 -0
  92. {ingestify-0.6.0 → ingestify-0.6.2}/setup.py +0 -0
PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: ingestify
-Version: 0.6.0
+Version: 0.6.2
 Summary: Data Ingestion Framework
 Author: Koen Vossen
 Author-email: info@koenvossen.nl
ingestify/__init__.py
@@ -8,4 +8,4 @@ if not __INGESTIFY_SETUP__:
 from .infra import retrieve_http
 from .source_base import Source, DatasetResource

-__version__ = "0.6.0"
+__version__ = "0.6.2"
ingestify/application/dataset_store.py
@@ -11,6 +11,7 @@ from typing import Dict, List, Optional, Union, Callable, BinaryIO, Awaitable

 from ingestify.domain.models.dataset.dataset import DatasetState
 from ingestify.domain.models.dataset.events import RevisionAdded, MetadataUpdated
+from ingestify.domain.models.dataset.file import NotModifiedFile
 from ingestify.domain.models.dataset.file_collection import FileCollection
 from ingestify.domain.models.dataset.revision import RevisionSource
 from ingestify.domain.models.event import EventBus
@@ -140,8 +141,8 @@ class DatasetStore:
        current_revision = dataset.current_revision

        for file_id, file_ in modified_files.items():
-            if file_ is None:
-                # It's always allowed to pass None as file. This means it didn't change and must be ignored.
+            if isinstance(file_, NotModifiedFile):
+                # It's always allowed to pass NotModifiedFile as file. This means it didn't change and must be ignored.
                continue

            current_file = (
@@ -210,9 +211,22 @@ class DatasetStore:
                f"Added a new revision to {dataset.identifier} -> {', '.join([file.file_id for file in persisted_files_])}"
            )
        else:
-            logger.info(
-                f"Ignoring a new revision without changed files -> {dataset.identifier}"
-            )
+            if dataset.update_last_modified(files):
+                # For some Datasets the last modified doesn't make sense (for sources that don't provide it)
+                # Do we want to update last modified of a Dataset when the value is utcnow()?
+                # self.dataset_repository.save(bucket=self.bucket, dataset=dataset)
+                # TODO: dispatch some event?
+                # self.dispatch(DatasetLastModifiedChanged(dataset=dataset))
+                logger.info(
+                    f"Ignoring a new revision without changed files -> {dataset.identifier}, but "
+                    f"might need to update last modified to {dataset.last_modified_at} ?"
+                )
+            else:
+                logger.info(
+                    f"Ignoring a new revision without changed files -> {dataset.identifier}"
+                )
+
            revision = None

        return revision
ingestify/domain/models/dataset/collection_metadata.py
@@ -11,4 +11,6 @@ class DatasetCollectionMetadata:

    # Use the last modified to only retrieve datasets that are changed
    last_modified: Optional[datetime]
-    row_count: int
+
+    # Not really used
+    row_count: Optional[int] = None
ingestify/domain/models/dataset/dataset.py
@@ -1,6 +1,6 @@
 from datetime import datetime
 from enum import Enum
-from typing import List, Optional
+from typing import List, Optional, Dict
 from pydantic import Field, field_validator

 from ingestify.utils import utcnow
@@ -52,6 +52,22 @@ class Dataset(BaseModel):
        else:
            self.last_modified_at = revision.last_modified_at

+    def update_last_modified(self, files: Dict[str, DraftFile]):
+        """Update the last modified, even tho there was no new revision. Some Sources
+        may report a Dataset is changed, even when there are no changed files.
+        Update the last_modified to prevent hitting the same Source for updates
+        """
+        changed = False
+        for file in files.values():
+            if file.modified_at and (
+                self.last_modified_at is None
+                or file.modified_at > self.last_modified_at
+            ):
+                # Update, and continue looking for others
+                self.last_modified_at = file.modified_at
+                changed = True
+        return changed
+
    def update_metadata(self, name: str, metadata: dict, state: DatasetState) -> bool:
        changed = False
        if self.name != name:
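The new Dataset.update_last_modified only moves the timestamp forward and reports whether it moved. A runnable stand-alone sketch of that bookkeeping (FileStub and DatasetStub are stand-ins, not the real pydantic models):

from dataclasses import dataclass
from datetime import datetime, timezone
from typing import Dict, Optional

@dataclass
class FileStub:          # stands in for DraftFile
    modified_at: Optional[datetime]

@dataclass
class DatasetStub:       # stands in for Dataset
    last_modified_at: Optional[datetime] = None

    def update_last_modified(self, files: Dict[str, FileStub]) -> bool:
        changed = False
        for file in files.values():
            # Only advance, never rewind; files without a timestamp are ignored
            if file.modified_at and (
                self.last_modified_at is None
                or file.modified_at > self.last_modified_at
            ):
                self.last_modified_at = file.modified_at
                changed = True
        return changed

dataset = DatasetStub(last_modified_at=datetime(2024, 4, 1, tzinfo=timezone.utc))
files = {
    "events": FileStub(modified_at=datetime(2024, 5, 1, tzinfo=timezone.utc)),
    "lineups": FileStub(modified_at=None),  # source without a timestamp is skipped
}
print(dataset.update_last_modified(files))  # True
print(dataset.last_modified_at)             # 2024-05-01 00:00:00+00:00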
ingestify/domain/models/dataset/file.py
@@ -29,7 +29,7 @@ class DraftFile(BaseModel):
        modified_at: Optional[datetime] = None,
    ):
        # Pass-through for these types
-        if isinstance(file_, DraftFile) or file_ is None:
+        if isinstance(file_, (DraftFile, NotModifiedFile)):
            return file_
        elif isinstance(file_, str):
            stream = BytesIO(file_.encode("utf-8"))
@@ -102,6 +102,11 @@ class File(BaseModel):
    )


+class NotModifiedFile(BaseModel):
+    modified_at: datetime
+    reason: str
+
+
 class LoadedFile(BaseModel):
    # Unique key to identify this File within a Dataset
    file_id: str
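NotModifiedFile replaces the bare None that 0.6.0 returned for "nothing changed", so the skip path now carries a timestamp and a human-readable reason. A hedged sketch of a consumer (handle and DraftFileStub are illustrative, not ingestify API):

from datetime import datetime, timezone
from typing import Union

from pydantic import BaseModel

class DraftFileStub(BaseModel):    # stand-in for DraftFile
    file_id: str

class NotModifiedFile(BaseModel):  # as declared in file.py above
    modified_at: datetime
    reason: str

def handle(file_: Union[DraftFileStub, NotModifiedFile]) -> None:
    # isinstance() replaces the old `file_ is None` check in dataset_store.py
    if isinstance(file_, NotModifiedFile):
        print(f"skip: {file_.reason} (last seen {file_.modified_at})")
    else:
        print(f"persist {file_.file_id}")

handle(NotModifiedFile(modified_at=datetime.now(timezone.utc), reason="304 http code"))
handle(DraftFileStub(file_id="events"))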
ingestify/domain/models/dataset/revision.py
@@ -44,13 +44,22 @@ class Revision(BaseModel):
    def modified_files_map(self) -> Dict[str, File]:
        return {file.file_id: file for file in self.modified_files}

-    def is_changed(self, files: Dict[str, datetime]) -> bool:
+    def is_changed(
+        self, files: Dict[str, datetime], dataset_last_modified_at: datetime
+    ) -> bool:
        modified_files_map = self.modified_files_map
        for file_id, last_modified in files.items():
            if file_id not in modified_files_map:
                return True

            if modified_files_map[file_id].modified_at < last_modified:
-                return True
+                if dataset_last_modified_at < last_modified:
+                    # For StatsBomb we use last_modified of match for lineups, and events files.
+                    # When only match is updated, the lineups and events files won't be updated
+                    # as the content is not changed. Therefore, those modified_at is not updated,
+                    # and we try to update it over and over again.
+                    # This check prevents that; always take the LastModifiedAt of the Dataset
+                    # into account
+                    return True

        return False
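The comment's StatsBomb scenario is easiest to see with concrete timestamps. A simplified stand-alone mirror of the new logic (not the real Revision class):

from datetime import datetime
from typing import Dict

def is_changed(stored: Dict[str, datetime],
               incoming: Dict[str, datetime],
               dataset_last_modified_at: datetime) -> bool:
    for file_id, last_modified in incoming.items():
        if file_id not in stored:
            return True
        # A stale per-file modified_at only triggers a refetch while the
        # Dataset-level timestamp is also behind the source's timestamp
        if stored[file_id] < last_modified and dataset_last_modified_at < last_modified:
            return True
    return False

match_ts = datetime(2024, 5, 1)             # source reports the match timestamp for every file
stored = {"events": datetime(2024, 4, 1)}   # events content never changed, so its modified_at lags

# First sync after the match update: Dataset timestamp also lags -> refetch once
print(is_changed(stored, {"events": match_ts}, datetime(2024, 4, 1)))  # True
# After update_last_modified() caught the Dataset up -> no endless refetch loop
print(is_changed(stored, {"events": match_ts}, match_ts))              # False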
ingestify/domain/models/dataset/selector.py
@@ -1,3 +1,6 @@
+from datetime import datetime
+from typing import Optional
+
 from ingestify.domain.models.data_spec_version_collection import (
     DataSpecVersionCollection,
 )
@@ -28,10 +31,17 @@ class Selector(AttributeBag):
    def data_spec_versions(self):
        return self._data_spec_versions

+    @property
+    def last_modified(self) -> Optional[datetime]:
+        try:
+            return self._last_modified
+        except AttributeError:
+            return None
+
    @property
    def custom_attributes(self):
        return {
            k: v
            for k, v in self.items()
-            if k not in ("_matcher", "_data_spec_versions")
+            if k not in ("_matcher", "_data_spec_versions", "_last_modified")
        }
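Selector is an AttributeBag, so a missing key surfaces as AttributeError and the new property maps it to None. A minimal stand-in showing that fallback (MiniSelector is hypothetical, not the real AttributeBag):

from datetime import datetime
from typing import Optional

class MiniSelector(dict):
    def __getattr__(self, key):
        # Attribute access falls through to dict keys, as assumed for AttributeBag
        try:
            return self[key]
        except KeyError:
            raise AttributeError(key)

    @property
    def last_modified(self) -> Optional[datetime]:
        try:
            return self._last_modified
        except AttributeError:
            return None

print(MiniSelector(competition_id=11).last_modified)                    # None
print(MiniSelector(_last_modified=datetime(2024, 5, 1)).last_modified)  # 2024-05-01 00:00:00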
ingestify/domain/models/fetch_policy.py
@@ -26,7 +26,9 @@ class FetchPolicy:
            file.file_id: file.last_modified
            for file in dataset_resource.files.values()
        }
-        if current_revision.is_changed(files_last_modified):
+        if current_revision.is_changed(
+            files_last_modified, dataset.last_modified_at
+        ):
            return True

        # We don't set last_modified on Dataset level anymore, only on file level
ingestify/domain/models/ingestion/ingestion_job.py
@@ -3,11 +3,12 @@ import json
 import logging
 import uuid
 from enum import Enum
-from typing import Optional, Iterator
+from typing import Optional, Iterator, Union

 from ingestify import retrieve_http
 from ingestify.application.dataset_store import DatasetStore
 from ingestify.domain import Selector, Identifier, TaskSet, Dataset, DraftFile, Task
+from ingestify.domain.models.dataset.file import NotModifiedFile
 from ingestify.domain.models.dataset.revision import RevisionSource, SourceType
 from ingestify.domain.models.ingestion.ingestion_job_summary import (
     IngestionJobSummary,
@@ -54,7 +55,7 @@ def to_batches(input_):

 def load_file(
     file_resource: FileResource, dataset: Optional[Dataset] = None
-) -> Optional[DraftFile]:
+) -> Union[DraftFile, NotModifiedFile]:
     current_file = None
     if dataset:
         current_file = dataset.current_revision.modified_files_map.get(
@@ -72,7 +73,10 @@ def load_file(
        )
        if current_file and current_file.tag == file.tag:
            # Nothing changed
-            return None
+            return NotModifiedFile(
+                modified_at=file_resource.last_modified,
+                reason="tag matched current_file",
+            )
        return file
    elif file_resource.url:
        http_options = {}
@@ -228,6 +232,19 @@ class IngestionJob:
        ).metadata
        logger.info(f"Done: {dataset_collection_metadata}")

+        if self.selector.last_modified and dataset_collection_metadata.last_modified:
+            # This check might fail when the data_spec_versions is changed;
+            # missing files are not detected
+            if self.selector.last_modified < dataset_collection_metadata.last_modified:
+                logger.info(
+                    f"Skipping find_datasets because selector last_modified "
+                    f"'{self.selector.last_modified}' < metadata last_modified "
+                    f"'{dataset_collection_metadata.last_modified}'"
+                )
+                ingestion_job_summary.set_skipped()
+                yield ingestion_job_summary
+                return
+
        # There are two different, but similar flows here:
        # 1. The discover_datasets returns a list, and the entire list can be processed at once
        # 2. The discover_datasets returns an iterator of batches, in this case we need to process each batch
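A tiny worked example of the new guard (datetimes illustrative): the job is skipped only when the selector carries a watermark and it is older than the newest last_modified_at already in the store.

from datetime import datetime, timezone

def should_skip(selector_last_modified, metadata_last_modified) -> bool:
    # Mirrors the guard above: both values must be present, and the selector's
    # watermark must be older than the store's newest last_modified_at
    return bool(
        selector_last_modified
        and metadata_last_modified
        and selector_last_modified < metadata_last_modified
    )

store_newest = datetime(2024, 6, 1, tzinfo=timezone.utc)
print(should_skip(datetime(2024, 5, 1, tzinfo=timezone.utc), store_newest))  # True  -> SKIPPED
print(should_skip(datetime(2024, 7, 1, tzinfo=timezone.utc), store_newest))  # False -> run job
print(should_skip(None, store_newest))                                       # False -> run job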
@@ -252,19 +269,18 @@ class IngestionJob:

        logger.info("Starting tasks")

-        finish_task_timer = ingestion_job_summary.start_timing("tasks")
-
        while True:
            logger.info(f"Finding next batch of datasets for selector={self.selector}")
+
            try:
                with ingestion_job_summary.record_timing("find_datasets"):
-                    batch = next(batches)
-            except StopIteration:
-                break
+                    try:
+                        batch = next(batches)
+                    except StopIteration:
+                        break
            except Exception as e:
                logger.exception("Failed to fetch next batch")

-                finish_task_timer()
                ingestion_job_summary.set_exception(e)
                yield ingestion_job_summary
                return
@@ -292,54 +308,57 @@ class IngestionJob:
            skipped_tasks = 0

            task_set = TaskSet()
-            for dataset_resource in batch:
-                dataset_identifier = Identifier.create_from_selector(
-                    self.selector, **dataset_resource.dataset_resource_id
-                )

-                if dataset := dataset_collection.get(dataset_identifier):
-                    if self.ingestion_plan.fetch_policy.should_refetch(
-                        dataset, dataset_resource
-                    ):
-                        task_set.add(
-                            UpdateDatasetTask(
-                                dataset=dataset,  # Current dataset from the database
-                                dataset_resource=dataset_resource,  # Most recent dataset_resource
-                                store=store,
+            with ingestion_job_summary.record_timing("build_task_set"):
+                for dataset_resource in batch:
+                    dataset_identifier = Identifier.create_from_selector(
+                        self.selector, **dataset_resource.dataset_resource_id
+                    )
+
+                    if dataset := dataset_collection.get(dataset_identifier):
+                        if self.ingestion_plan.fetch_policy.should_refetch(
+                            dataset, dataset_resource
+                        ):
+                            task_set.add(
+                                UpdateDatasetTask(
+                                    dataset=dataset,  # Current dataset from the database
+                                    dataset_resource=dataset_resource,  # Most recent dataset_resource
+                                    store=store,
+                                )
                            )
-                        )
+                        else:
+                            skipped_tasks += 1
                    else:
-                        skipped_tasks += 1
-                else:
-                    if self.ingestion_plan.fetch_policy.should_fetch(dataset_resource):
-                        task_set.add(
-                            CreateDatasetTask(
-                                dataset_resource=dataset_resource,
-                                store=store,
+                        if self.ingestion_plan.fetch_policy.should_fetch(
+                            dataset_resource
+                        ):
+                            task_set.add(
+                                CreateDatasetTask(
+                                    dataset_resource=dataset_resource,
+                                    store=store,
+                                )
                            )
-                        )
-                    else:
-                        skipped_tasks += 1
-
-            if task_set:
-                logger.info(
-                    f"Discovered {len(dataset_identifiers)} datasets from {self.ingestion_plan.source.__class__.__name__} "
-                    f"using selector {self.selector} => {len(task_set)} tasks. {skipped_tasks} skipped."
-                )
-                logger.info(f"Running {len(task_set)} tasks")
-                ingestion_job_summary.add_task_summaries(
-                    task_executor.run(run_task, task_set)
-                )
-            else:
-                logger.info(
-                    f"Discovered {len(dataset_identifiers)} datasets from {self.ingestion_plan.source.__class__.__name__} "
-                    f"using selector {self.selector} => nothing to do"
-                )
-
-            ingestion_job_summary.increase_skipped_tasks(skipped_tasks)
+                        else:
+                            skipped_tasks += 1
+
+            with ingestion_job_summary.record_timing("tasks"):
+                if task_set:
+                    logger.info(
+                        f"Discovered {len(dataset_identifiers)} datasets from {self.ingestion_plan.source.__class__.__name__} "
+                        f"using selector {self.selector} => {len(task_set)} tasks. {skipped_tasks} skipped."
+                    )
+                    logger.info(f"Running {len(task_set)} tasks")
+                    ingestion_job_summary.add_task_summaries(
+                        task_executor.run(run_task, task_set)
+                    )
+                else:
+                    logger.info(
+                        f"Discovered {len(dataset_identifiers)} datasets from {self.ingestion_plan.source.__class__.__name__} "
+                        f"using selector {self.selector} => nothing to do"
+                    )
+                ingestion_job_summary.increase_skipped_tasks(skipped_tasks)

            if ingestion_job_summary.task_count() >= MAX_TASKS_PER_CHUNK:
-                finish_task_timer()
                ingestion_job_summary.set_finished()
                yield ingestion_job_summary
@@ -347,11 +366,7 @@ class IngestionJob:
                is_first_chunk = False
                ingestion_job_summary = IngestionJobSummary.new(ingestion_job=self)

-        # We will resume tasks, start timer right away
-        finish_task_timer = ingestion_job_summary.start_timing("tasks")
-
        if ingestion_job_summary.task_count() > 0 or is_first_chunk:
            # When there is interesting information to store, or there was no data at all, store it
-            finish_task_timer()
            ingestion_job_summary.set_finished()
            yield ingestion_job_summary
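The hunks above retire the manually paired start_timing(...)/finish_task_timer() calls in favour of record_timing(...) context managers, which cannot be left open across a break, early return, or exception. A generic sketch of that pattern (hypothetical timer, not the real HasTiming implementation):

import time
from contextlib import contextmanager

timings = {}

@contextmanager
def record_timing(name: str):
    started = time.perf_counter()
    try:
        yield
    finally:
        # Always closed, even on break/return/exception: the finally block runs
        timings[name] = timings.get(name, 0.0) + (time.perf_counter() - started)

with record_timing("tasks"):
    time.sleep(0.01)
print(timings)  # e.g. {'tasks': 0.0101...}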
ingestify/domain/models/ingestion/ingestion_job_summary.py
@@ -18,6 +18,7 @@ if TYPE_CHECKING:
 class IngestionJobState(str, Enum):
     RUNNING = "RUNNING"
     FINISHED = "FINISHED"
+    SKIPPED = "SKIPPED"
     FAILED = "FAILED"

@@ -104,6 +105,10 @@ class IngestionJobSummary(BaseModel, HasTiming):
        self.state = IngestionJobState.FAILED
        self._set_ended()

+    def set_skipped(self):
+        self.state = IngestionJobState.SKIPPED
+        self._set_ended()
+
    @property
    def duration(self) -> timedelta:
        return self.ended_at - self.started_at
ingestify/infra/fetch/http.py
@@ -3,13 +3,14 @@ from datetime import datetime
 from email.utils import format_datetime, parsedate
 from hashlib import sha1
 from io import BytesIO
-from typing import Optional, Callable, Tuple
+from typing import Optional, Callable, Tuple, Union

 import requests
 from requests.adapters import HTTPAdapter
 from urllib3 import Retry

 from ingestify.domain.models import DraftFile, File
+from ingestify.domain.models.dataset.file import NotModifiedFile
 from ingestify.utils import utcnow

 _session = None
@@ -46,12 +47,15 @@ def retrieve_http(
    pager: Optional[Tuple[str, Callable[[str, dict], Optional[str]]]] = None,
    last_modified: Optional[datetime] = None,
    **kwargs,
-) -> Optional[DraftFile]:
+) -> Union[DraftFile, NotModifiedFile]:
    headers = headers or {}
    if current_file:
        if last_modified and current_file.modified_at >= last_modified:
            # Not changed
-            return None
+            return NotModifiedFile(
+                modified_at=last_modified,
+                reason=f"last-modified same as current file: {current_file.modified_at} >= {last_modified}",
+            )
    # else:
    #     print(f"{current_file.modified_at=} {last_modified=}")
    #     headers["if-modified-since"] = (
@@ -73,12 +77,14 @@ def retrieve_http(

    response = get_session().get(url, headers=headers, **http_kwargs)
    if response.status_code == 404 and ignore_not_found:
-        return None
+        return NotModifiedFile(
+            modified_at=last_modified, reason="404 http code and ignore-not-found"
+        )

    response.raise_for_status()
    if response.status_code == 304:
        # Not modified
-        return None
+        return NotModifiedFile(modified_at=last_modified, reason="304 http code")

    if last_modified:
        # From metadata received from api in discover_datasets
@@ -120,7 +126,9 @@ def retrieve_http(

    if current_file and current_file.tag == tag:
        # Not changed. Don't keep it
-        return None
+        return NotModifiedFile(
+            modified_at=last_modified, reason="tag matched current_file"
+        )

    return DraftFile(
        created_at=utcnow(),
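Taken together, retrieve_http now has four "not modified" exits where 0.6.0 returned a bare None. A condensed, runnable mirror of that decision ladder (stand-ins only; the real function also handles pagers, retries, if-modified-since headers, and content hashing):

from datetime import datetime, timezone
from typing import Optional, Union

from pydantic import BaseModel

class NotModifiedFile(BaseModel):
    modified_at: datetime
    reason: str

def classify(status_code: int,
             ignore_not_found: bool,
             last_modified: datetime,
             current_modified_at: Optional[datetime]) -> Union[str, NotModifiedFile]:
    # Exit 1: the current file is already at least as new as the source claims
    if current_modified_at and current_modified_at >= last_modified:
        return NotModifiedFile(
            modified_at=last_modified,
            reason=f"last-modified same as current file: {current_modified_at} >= {last_modified}",
        )
    # Exit 2: tolerated 404
    if status_code == 404 and ignore_not_found:
        return NotModifiedFile(modified_at=last_modified, reason="404 http code and ignore-not-found")
    # Exit 3: server-side conditional GET said nothing changed
    if status_code == 304:
        return NotModifiedFile(modified_at=last_modified, reason="304 http code")
    # (Exit 4 in the real code: content tag matches the current file)
    return "fetch body and build a DraftFile"

now = datetime.now(timezone.utc)
print(classify(200, False, now, now).reason)   # last-modified same as current file: ...
print(classify(304, False, now, None).reason)  # 304 http code
print(classify(200, False, now, None))         # fetch body and build a DraftFile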
ingestify/infra/store/dataset/sqlalchemy/repository.py
@@ -413,20 +413,25 @@ class SqlAlchemyDatasetRepository(DatasetRepository):
        else:
            datasets = []

-        metadata_result_query = apply_query_filter(
-            self.session.query(
-                func.max(dataset_table.c.last_modified_at).label(
-                    "last_modified_at"
-                ),
-                func.count().label("row_count"),
+        metadata_result_query = (
+            apply_query_filter(
+                self.session.query(dataset_table.c.last_modified_at)
            )
+            .order_by(dataset_table.c.last_modified_at.desc())
+            .limit(1)
        )

        self._debug_query(metadata_result_query)

-        dataset_collection_metadata = DatasetCollectionMetadata(
-            *metadata_result_query.first()
-        )
+        metadata_row = metadata_result_query.first()
+        if metadata_row:
+            dataset_collection_metadata = DatasetCollectionMetadata(
+                last_modified=metadata_row.last_modified_at
+            )
+        else:
+            dataset_collection_metadata = DatasetCollectionMetadata(
+                last_modified=None
+            )

        return DatasetCollection(dataset_collection_metadata, datasets)

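The metadata query swaps a MAX()/COUNT() aggregate for an ORDER BY … LIMIT 1 lookup, matching the now-optional row_count in DatasetCollectionMetadata; on an empty table both forms yield no value, which the new metadata_row fallback handles. A toy-schema sketch of the two equivalent queries (assumed columns; SQLAlchemy 1.4+ select() API):

from datetime import datetime

from sqlalchemy import (Column, DateTime, MetaData, String, Table,
                        create_engine, func, select)

metadata = MetaData()
dataset_table = Table(
    "dataset", metadata,
    Column("dataset_id", String, primary_key=True),
    Column("last_modified_at", DateTime),
)
engine = create_engine("sqlite://")  # in-memory toy database
metadata.create_all(engine)

with engine.begin() as conn:
    conn.execute(dataset_table.insert(), [
        {"dataset_id": "a", "last_modified_at": datetime(2024, 4, 1)},
        {"dataset_id": "b", "last_modified_at": datetime(2024, 5, 1)},
    ])

with engine.connect() as conn:
    # 0.6.0 style: aggregate over the (filtered) set, also computed a row count
    old_query = select(func.max(dataset_table.c.last_modified_at))
    # 0.6.2 style: top-1 ordered scan; can be served by an index and skips COUNT()
    new_query = (
        select(dataset_table.c.last_modified_at)
        .order_by(dataset_table.c.last_modified_at.desc())
        .limit(1)
    )
    print(conn.execute(old_query).scalar())  # 2024-05-01 00:00:00
    print(conn.execute(new_query).scalar())  # 2024-05-01 00:00:00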
ingestify.egg-info/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: ingestify
-Version: 0.6.0
+Version: 0.6.2
 Summary: Data Ingestion Framework
 Author: Koen Vossen
 Author-email: info@koenvossen.nl