udata 8.0.2.dev29253__py2.py3-none-any.whl → 8.0.2.dev29263__py2.py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of udata might be problematic.

Files changed (29)
  1. udata/harvest/backends/base.py +93 -103
  2. udata/harvest/backends/dcat.py +51 -82
  3. udata/harvest/tasks.py +3 -13
  4. udata/harvest/tests/factories.py +13 -6
  5. udata/harvest/tests/test_actions.py +2 -2
  6. udata/harvest/tests/test_base_backend.py +9 -5
  7. udata/harvest/tests/test_dcat_backend.py +13 -0
  8. udata/static/chunks/{11.7266fef2dddc1db403d9.js → 11.ae54612e36c6d46f85db.js} +3 -3
  9. udata/static/chunks/{11.7266fef2dddc1db403d9.js.map → 11.ae54612e36c6d46f85db.js.map} +1 -1
  10. udata/static/chunks/{13.91b177d7d531fd55cf5d.js → 13.d8ccb992a49875966313.js} +2 -2
  11. udata/static/chunks/{13.91b177d7d531fd55cf5d.js.map → 13.d8ccb992a49875966313.js.map} +1 -1
  12. udata/static/chunks/{16.e866757bab9f6b0a3f1b.js → 16.4565605e68bab129a471.js} +2 -2
  13. udata/static/chunks/{16.e866757bab9f6b0a3f1b.js.map → 16.4565605e68bab129a471.js.map} +1 -1
  14. udata/static/chunks/{19.619b83ac597516dcd03e.js → 19.f993a75d5bfe2382548d.js} +3 -3
  15. udata/static/chunks/{19.619b83ac597516dcd03e.js.map → 19.f993a75d5bfe2382548d.js.map} +1 -1
  16. udata/static/chunks/{5.48417db6b33328fa9d6a.js → 5.cc2e7bf65ef32f9c8604.js} +3 -3
  17. udata/static/chunks/{5.48417db6b33328fa9d6a.js.map → 5.cc2e7bf65ef32f9c8604.js.map} +1 -1
  18. udata/static/chunks/{6.f84539bd4c419b36cc19.js → 6.cad898a38692eda28965.js} +3 -3
  19. udata/static/chunks/{6.f84539bd4c419b36cc19.js.map → 6.cad898a38692eda28965.js.map} +1 -1
  20. udata/static/chunks/{9.07503e7f7ec02919f696.js → 9.d5b992e9ef51921aeb57.js} +2 -2
  21. udata/static/chunks/{9.07503e7f7ec02919f696.js.map → 9.d5b992e9ef51921aeb57.js.map} +1 -1
  22. udata/static/common.js +1 -1
  23. udata/static/common.js.map +1 -1
  24. {udata-8.0.2.dev29253.dist-info → udata-8.0.2.dev29263.dist-info}/METADATA +2 -2
  25. {udata-8.0.2.dev29253.dist-info → udata-8.0.2.dev29263.dist-info}/RECORD +29 -29
  26. {udata-8.0.2.dev29253.dist-info → udata-8.0.2.dev29263.dist-info}/LICENSE +0 -0
  27. {udata-8.0.2.dev29253.dist-info → udata-8.0.2.dev29263.dist-info}/WHEEL +0 -0
  28. {udata-8.0.2.dev29253.dist-info → udata-8.0.2.dev29263.dist-info}/entry_points.txt +0 -0
  29. {udata-8.0.2.dev29253.dist-info → udata-8.0.2.dev29263.dist-info}/top_level.txt +0 -0
udata/harvest/backends/base.py CHANGED
@@ -2,6 +2,7 @@ import logging
  import traceback
 
  from datetime import datetime, date, timedelta
+ from typing import Optional
  from uuid import UUID
 
  import requests
@@ -68,7 +69,12 @@ class HarvestFeature(object):
 
 
  class BaseBackend(object):
- '''Base class for Harvester implementations'''
+ """
+ Base class that wrap children methods to add error management and debug logs.
+ Also provides a few helpers needed on all or some backends.
+
+
+ """
 
  name = None
  display_name = None
@@ -127,92 +133,69 @@ class BaseBackend(object):
  def get_filters(self):
  return self.config.get('filters', [])
 
- def harvest(self):
- '''Start the harvesting process'''
- if self.perform_initialization() is not None:
- self.process_items()
- self.finalize()
- return self.job
+ def inner_harvest(self):
+ raise NotImplementedError
+
+ def inner_process_dataset(self, item: HarvestItem) -> Dataset:
+ raise NotImplementedError
 
- def perform_initialization(self):
- '''Initialize the harvesting for a given job'''
- log.debug('Initializing backend')
+ def harvest(self):
+ log.debug(f'Starting harvesting {self.source.name} ({self.source.url})…')
  factory = HarvestJob if self.dryrun else HarvestJob.objects.create
- self.job = factory(status='initializing',
+ self.job = factory(status='initialized',
  started=datetime.utcnow(),
  source=self.source)
 
  before_harvest_job.send(self)
 
  try:
- self.initialize()
- self.job.status = 'initialized'
- if not self.dryrun:
- self.job.save()
+ self.inner_harvest()
+
+ if self.source.autoarchive:
+ self.autoarchive()
+
+ self.job.status = 'done'
+
+ if any(i.status == 'failed' for i in self.job.items):
+ self.job.status += '-errors'
  except HarvestValidationError as e:
- log.info('Initialization failed for "%s" (%s)',
- safe_unicode(self.source.name), self.source.backend)
- error = HarvestError(message=safe_unicode(e))
- self.job.errors.append(error)
- self.job.status = 'failed'
- self.end()
- return None
- except Exception as e:
+ log.exception(f'Harvesting validation failed for "{safe_unicode(self.source.name)}" ({self.source.backend})')
+
  self.job.status = 'failed'
+
  error = HarvestError(message=safe_unicode(e))
  self.job.errors.append(error)
- self.end()
- msg = 'Initialization failed for "{0.name}" ({0.backend})'
- log.exception(msg.format(self.source))
- return None
+ except Exception as e:
+ log.exception(f'Harvesting failed for "{safe_unicode(self.source.name)}" ({self.source.backend})')
 
- if self.max_items:
- self.job.items = self.job.items[:self.max_items]
+ self.job.status = 'failed'
 
- if self.job.items:
- log.debug('Queued %s items', len(self.job.items))
+ error = HarvestError(message=safe_unicode(e), details=traceback.format_exc())
+ self.job.errors.append(error)
+ finally:
+ self.end_job()
+
+ return self.job
 
- return len(self.job.items)
+ def process_dataset(self, remote_id: str, **kwargs):
+ log.debug(f'Processing dataset {remote_id}…')
 
- def initialize(self):
- raise NotImplementedError
+ # TODO add `type` to `HarvestItem` to differentiate `Dataset` from `Dataservice`
+ item = HarvestItem(status='started', started=datetime.utcnow(), remote_id=remote_id)
+ self.job.items.append(item)
+ self.save_job()
 
- def process_items(self):
- '''Process the data identified in the initialize stage'''
- for item in self.job.items:
- self.process_item(item)
+ try:
+ if not remote_id:
+ raise HarvestSkipException("missing identifier")
 
- def process_item(self, item):
- log.debug('Processing: %s', item.remote_id)
- item.status = 'started'
- item.started = datetime.utcnow()
- if not self.dryrun:
- self.job.save()
+ dataset = self.inner_process_dataset(item, **kwargs)
 
- try:
- dataset = self.process(item)
- if not dataset.harvest:
- dataset.harvest = HarvestDatasetMetadata()
- dataset.harvest.domain = self.source.domain
- dataset.harvest.remote_id = item.remote_id
- dataset.harvest.source_id = str(self.source.id)
- dataset.harvest.last_update = datetime.utcnow()
- dataset.harvest.backend = self.display_name
-
- # unset archived status if needed
- if dataset.harvest:
- dataset.harvest.archived_at = None
- dataset.harvest.archived = None
+ # Use `item.remote_id` because `inner_process_dataset` could have modified it.
+ dataset.harvest = self.update_harvest_info(dataset.harvest, item.remote_id)
  dataset.archived = None
 
- # TODO permissions checking
- if not dataset.organization and not dataset.owner:
- if self.source.organization:
- dataset.organization = self.source.organization
- elif self.source.owner:
- dataset.owner = self.source.owner
-
- # TODO: Apply editble mappings
+ # TODO: Apply editable mappings
 
  if self.dryrun:
  dataset.validate()
@@ -221,26 +204,54 @@ class BaseBackend(object):
  item.dataset = dataset
  item.status = 'done'
  except HarvestSkipException as e:
- log.info('Skipped item %s : %s', item.remote_id, safe_unicode(e))
  item.status = 'skipped'
+
+ log.info(f'Skipped item {item.remote_id} : {safe_unicode(e)}')
  item.errors.append(HarvestError(message=safe_unicode(e)))
  except HarvestValidationError as e:
- log.info('Error validating item %s : %s', item.remote_id, safe_unicode(e))
  item.status = 'failed'
+
+ log.info(f'Error validating item {item.remote_id} : {safe_unicode(e)}')
  item.errors.append(HarvestError(message=safe_unicode(e)))
  except Exception as e:
- log.exception('Error while processing %s : %s',
- item.remote_id,
- safe_unicode(e))
- error = HarvestError(message=safe_unicode(e),
- details=traceback.format_exc())
- item.errors.append(error)
  item.status = 'failed'
+ log.exception(f'Error while processing {item.remote_id} : {safe_unicode(e)}')
+
+ error = HarvestError(message=safe_unicode(e), details=traceback.format_exc())
+ item.errors.append(error)
+ finally:
+ item.ended = datetime.utcnow()
+ self.save_job()
+
+ def is_done(self) -> bool:
+ '''Should be called after process_dataset to know if we reach the max items'''
+ return self.max_items and len(self.job.items) >= self.max_items
+
+ def update_harvest_info(self, harvest: Optional[HarvestDatasetMetadata], remote_id: int):
+ if not harvest:
+ harvest = HarvestDatasetMetadata()
+ harvest.domain = self.source.domain
+ harvest.remote_id = remote_id
+ harvest.source_id = str(self.source.id)
+ harvest.last_update = datetime.utcnow()
+ harvest.backend = self.display_name
 
- item.ended = datetime.utcnow()
+ harvest.archived_at = None
+ harvest.archived = None
+
+ return harvest
+
+ def save_job(self):
+ if not self.dryrun:
+ self.job.save()
+
+ def end_job(self):
+ self.job.ended = datetime.utcnow()
  if not self.dryrun:
  self.job.save()
 
+ after_harvest_job.send(self)
+
  def autoarchive(self):
  '''
  Archive items that exist on the local instance but not on remote platform
@@ -262,34 +273,13 @@ class BaseBackend(object):
  archive_harvested_dataset(dataset, reason='not-on-remote', dryrun=self.dryrun)
  # add a HarvestItem to the job list (useful for report)
  # even when archiving has already been done (useful for debug)
- item = self.add_item(dataset.harvest.remote_id)
- item.dataset = dataset
- item.status = 'archived'
-
- if not self.dryrun:
- self.job.save()
+ self.job.items.append(HarvestItem(
+ remote_id=str(dataset.harvest.remote_id),
+ dataset=dataset,
+ status='archived'
+ ))
 
- def process(self, item):
- raise NotImplementedError
-
- def add_item(self, identifier, *args, **kwargs):
- item = HarvestItem(remote_id=str(identifier), args=args, kwargs=kwargs)
- self.job.items.append(item)
- return item
-
- def finalize(self):
- if self.source.autoarchive:
- self.autoarchive()
- self.job.status = 'done'
- if any(i.status == 'failed' for i in self.job.items):
- self.job.status += '-errors'
- self.end()
-
- def end(self):
- self.job.ended = datetime.utcnow()
- if not self.dryrun:
- self.job.save()
- after_harvest_job.send(self)
+ self.save_job()
 
  def get_dataset(self, remote_id):
  '''Get or create a dataset given its remote ID (and its source)
@@ -352,4 +342,4 @@ class BaseBackend(object):
  msg = str(error)
  errors.append(msg)
  msg = '\n- '.join(['Validation error:'] + errors)
- raise HarvestValidationError(msg)
+ raise HarvestValidationError(msg)
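
The net effect on `base.py` is that concrete backends no longer implement `initialize()`/`process()`; they implement `inner_harvest()` and `inner_process_dataset()`, and `BaseBackend.harvest()`/`process_dataset()` take care of item creation, error capture and job persistence. A minimal sketch of the new contract, modelled on the `FakeBackend`/`FactoryBackend` changes further down (the class name and the hard-coded identifiers are illustrative only):

```python
from udata.harvest.backends.base import BaseBackend
from udata.harvest.models import HarvestItem


class ExampleBackend(BaseBackend):
    """Hypothetical backend illustrating the reworked BaseBackend API."""

    display_name = 'Example'

    def inner_harvest(self):
        # Enumerate remote identifiers; process_dataset() creates the
        # HarvestItem, catches exceptions and saves the job after each item.
        for remote_id in ('a', 'b', 'c'):  # placeholder identifiers
            self.process_dataset(remote_id)
            if self.is_done():  # True once max_items has been reached
                return

    def inner_process_dataset(self, item: HarvestItem):
        # Return the dataset; BaseBackend.update_harvest_info() then fills in
        # domain, remote_id, source_id, last_update and backend metadata.
        dataset = self.get_dataset(item.remote_id)
        dataset.title = f'dataset-{item.remote_id}'
        return dataset
```
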
udata/harvest/backends/dcat.py CHANGED
@@ -7,13 +7,15 @@ import boto3
  from flask import current_app
  from datetime import date
  import json
- from typing import List
+ from typing import Generator, List
 
+ from udata.core.dataset.models import Dataset
  from udata.rdf import (
  DCAT, DCT, HYDRA, SPDX, namespace_manager, guess_format, url_from_rdf
  )
  from udata.core.dataset.rdf import dataset_from_rdf
  from udata.storage.s3 import store_as_json, get_from_json
+ from udata.harvest.models import HarvestItem
 
  from .base import BaseBackend
 
@@ -59,14 +61,17 @@ def extract_graph(source, target, node, specs):
  class DcatBackend(BaseBackend):
  display_name = 'DCAT'
 
- def initialize(self):
- '''List all datasets for a given ...'''
+ def inner_harvest(self):
  fmt = self.get_format()
- graphs = self.parse_graph(self.source.url, fmt)
-
  self.job.data = { 'format': fmt }
 
- serialized_graphs = [graph.serialize(format=fmt, indent=None) for graph in graphs]
+ serialized_graphs = []
+
+ for page_number, page in self.walk_graph(self.source.url, fmt):
+ self.process_one_datasets_page(page_number, page)
+ serialized_graphs.append(page.serialize(format=fmt, indent=None))
+
+ # TODO call `walk_graph` with `process_dataservices`
 
  # The official MongoDB document size in 16MB. The default value here is 15MB to account for other fields in the document (and for difference between * 1024 vs * 1000).
  max_harvest_graph_size_in_mongo = current_app.config.get('HARVEST_MAX_CATALOG_SIZE_IN_MONGO')
@@ -105,13 +110,11 @@ class DcatBackend(BaseBackend):
  raise ValueError(msg)
  return fmt
 
- def parse_graph(self, url, fmt) -> List[Graph]:
+ def walk_graph(self, url: str, fmt: str) -> Generator[tuple[int, Graph], None, None]:
  """
- Returns an instance of rdflib.Graph for each detected page
- The index in the list is the page number
+ Yield all RDF pages as `Graph` from the source
  """
- graphs = []
- page = 0
+ page_number = 0
  while url:
  subgraph = Graph(namespace_manager=namespace_manager)
  response = self.get(url)
@@ -128,19 +131,26 @@ class DcatBackend(BaseBackend):
  pagination = subgraph.resource(pagination)
  url = url_from_rdf(pagination, prop)
  break
- graphs.append(subgraph)
 
- for node in subgraph.subjects(RDF.type, DCAT.Dataset):
- id = subgraph.value(node, DCT.identifier)
- kwargs = {'page': page}
- self.add_item(id, **kwargs)
- if self.max_items and len(self.job.items) >= self.max_items:
- # this will stop iterating on pagination
- url = None
+ yield page_number, subgraph
+ if self.is_done():
+ return
 
- page += 1
+ page_number += 1
+
+ def process_one_datasets_page(self, page_number: int, page: Graph):
+ for node in page.subjects(RDF.type, DCAT.Dataset):
+ remote_id = page.value(node, DCT.identifier)
+ self.process_dataset(remote_id, page_number=page_number, page=page, node=node)
+
+ if self.is_done():
+ return
+
+ def inner_process_dataset(self, item: HarvestItem, page_number: int, page: Graph, node):
+ item.kwargs['page_number'] = page_number
 
- return graphs
+ dataset = self.get_dataset(item.remote_id)
+ return dataset_from_rdf(page, dataset, node=node)
 
  def get_node_from_item(self, graph, item):
  for node in graph.subjects(RDF.type, DCAT.Dataset):
@@ -148,33 +158,6 @@ class DcatBackend(BaseBackend):
  return node
  raise ValueError(f'Unable to find dataset with DCT.identifier:{item.remote_id}')
 
- def process(self, item):
- if item.remote_id == 'None':
- raise ValueError('The DCT.identifier is missing on this DCAT.Dataset record')
- graph = Graph(namespace_manager=namespace_manager)
-
- if self.job.data.get('graphs') is not None:
- graphs = self.job.data['graphs']
- else:
- bucket = current_app.config.get('HARVEST_GRAPHS_S3_BUCKET')
- if bucket is None:
- raise ValueError(f"No bucket configured but the harvest job item {item.id} on job {self.job.id} doesn't have a graph in MongoDB.")
-
- graphs = get_from_json(bucket, self.job.data['filename'])
- if graphs is None:
- raise ValueError(f"The file '{self.job.data['filename']}' is missing in S3 bucket '{bucket}'")
-
- data = graphs[item.kwargs['page']]
- format = self.job.data['format']
-
- graph.parse(data=bytes(data, encoding='utf8'), format=format)
- node = self.get_node_from_item(graph, item)
-
- dataset = self.get_dataset(item.remote_id)
- dataset = dataset_from_rdf(graph, dataset, node=node)
- return dataset
-
-
  def next_record_if_should_continue(self, start, search_results):
  next_record = int(search_results.attrib['nextRecord'])
  matched_count = int(search_results.attrib['numberOfRecordsMatched'])
@@ -209,7 +192,10 @@ class CswDcatBackend(DcatBackend):
 
  DCAT_SCHEMA = 'http://www.w3.org/ns/dcat#'
 
- def parse_graph(self, url: str, fmt: str) -> List[Graph]:
+ def walk_graph(self, url: str, fmt: str) -> Generator[tuple[int, Graph], None, None]:
+ """
+ Yield all RDF pages as `Graph` from the source
+ """
  body = '''<csw:GetRecords xmlns:csw="http://www.opengis.net/cat/csw/2.0.2"
  xmlns:gmd="http://www.isotc211.org/2005/gmd"
  service="CSW" version="2.0.2" resultType="results"
@@ -227,8 +213,7 @@ class CswDcatBackend(DcatBackend):
  </csw:GetRecords>'''
  headers = {'Content-Type': 'application/xml'}
 
- graphs = []
- page = 0
+ page_number = 0
  start = 1
 
  response = self.post(url, data=body.format(start=start, schema=self.DCAT_SCHEMA),
@@ -239,36 +224,29 @@ class CswDcatBackend(DcatBackend):
  if tree.tag == '{' + OWS_NAMESPACE + '}ExceptionReport':
  raise ValueError(f'Failed to query CSW:\n{content}')
  while tree:
- graph = Graph(namespace_manager=namespace_manager)
  search_results = tree.find('csw:SearchResults', {'csw': CSW_NAMESPACE})
  if search_results is None:
- log.error(f'No search results found for {url} on page {page}')
+ log.error(f'No search results found for {url} on page {page_number}')
  break
  for child in search_results:
  subgraph = Graph(namespace_manager=namespace_manager)
  subgraph.parse(data=ET.tostring(child), format=fmt)
- graph += subgraph
 
- for node in subgraph.subjects(RDF.type, DCAT.Dataset):
- id = subgraph.value(node, DCT.identifier)
- kwargs = {'nid': str(node), 'page': page}
- kwargs['type'] = 'uriref' if isinstance(node, URIRef) else 'blank'
- self.add_item(id, **kwargs)
- graphs.append(graph)
+ yield page_number, subgraph
+ if self.is_done():
+ return
 
  next_record = self.next_record_if_should_continue(start, search_results)
  if not next_record:
  break
 
  start = next_record
- page += 1
+ page_number += 1
 
  tree = ET.fromstring(
  self.post(url, data=body.format(start=start, schema=self.DCAT_SCHEMA),
  headers=headers).content)
 
- return graphs
-
 
  class CswIso19139DcatBackend(DcatBackend):
  '''
@@ -282,13 +260,10 @@ class CswIso19139DcatBackend(DcatBackend):
 
  XSL_URL = "https://raw.githubusercontent.com/SEMICeu/iso-19139-to-dcat-ap/master/iso-19139-to-dcat-ap.xsl"
 
- def parse_graph(self, url: str, fmt: str) -> List[Graph]:
- '''
- Parse CSW graph querying ISO schema.
- Use SEMIC GeoDCAT-AP XSLT to map it to a correct version.
- See https://github.com/SEMICeu/iso-19139-to-dcat-ap for more information on the XSLT.
- '''
-
+ def walk_graph(self, url: str, fmt: str) -> Generator[tuple[int, Graph], None, None]:
+ """
+ Yield all RDF pages as `Graph` from the source
+ """
  # Load XSLT
  xsl = ET.fromstring(self.get(self.XSL_URL).content)
  transform = ET.XSLT(xsl)
@@ -320,8 +295,7 @@ class CswIso19139DcatBackend(DcatBackend):
  </csw:GetRecords>'''
  headers = {'Content-Type': 'application/xml'}
 
- graphs = []
- page = 0
+ page_number = 0
  start = 1
 
  response = self.post(url, data=body.format(start=start, schema=self.ISO_SCHEMA),
@@ -338,7 +312,7 @@ class CswIso19139DcatBackend(DcatBackend):
  # infos (useful for pagination)
  search_results = tree_before_transform.find('csw:SearchResults', {'csw': CSW_NAMESPACE})
  if search_results is None:
- log.error(f'No search results found for {url} on page {page}')
+ log.error(f'No search results found for {url} on page {page_number}')
  break
 
  subgraph = Graph(namespace_manager=namespace_manager)
@@ -347,19 +321,16 @@ class CswIso19139DcatBackend(DcatBackend):
  if not subgraph.subjects(RDF.type, DCAT.Dataset):
  raise ValueError("Failed to fetch CSW content")
 
- for node in subgraph.subjects(RDF.type, DCAT.Dataset):
- id = subgraph.value(node, DCT.identifier)
- kwargs = {'nid': str(node), 'page': page}
- kwargs['type'] = 'uriref' if isinstance(node, URIRef) else 'blank'
- self.add_item(id, **kwargs)
- graphs.append(subgraph)
+ yield page_number, subgraph
+ if self.is_done():
+ return
 
  next_record = self.next_record_if_should_continue(start, search_results)
  if not next_record:
  break
 
  start = next_record
- page += 1
+ page_number += 1
 
  response = self.post(url, data=body.format(start=start, schema=self.ISO_SCHEMA),
  headers=headers)
@@ -367,5 +338,3 @@ class CswIso19139DcatBackend(DcatBackend):
 
  tree_before_transform = ET.fromstring(response.content)
  tree = transform(tree_before_transform, CoupledResourceLookUp="'disabled'")
-
- return graphs
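
After this refactor the three DCAT backends differ only in how they fetch pages: each overrides `walk_graph()` as a generator, while `DcatBackend.inner_harvest()` serializes the pages and feeds every `dcat:Dataset` node to `process_dataset()` with the parsed `Graph` and node in `**kwargs`, so `inner_process_dataset()` no longer reloads graphs from MongoDB or S3. As an illustration of what a new paginated source would need, here is a hedged sketch (the class, the `?page=N` URL scheme and the emptiness check are invented; `self.get()` is assumed to return a `requests.Response` as in the existing backends):

```python
from typing import Generator

from rdflib import Graph

from udata.rdf import namespace_manager
from udata.harvest.backends.dcat import DcatBackend


class JsonLdPagesBackend(DcatBackend):
    """Hypothetical catalog exposing one JSON-LD document per ?page=N."""

    def walk_graph(self, url: str, fmt: str) -> Generator[tuple[int, Graph], None, None]:
        page_number = 0
        while True:
            subgraph = Graph(namespace_manager=namespace_manager)
            subgraph.parse(data=self.get(f'{url}?page={page_number}').text, format=fmt)
            if not len(subgraph):
                return  # empty page: nothing left to harvest
            yield page_number, subgraph
            if self.is_done():  # stop paginating once enough items were queued
                return
            page_number += 1
```
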
udata/harvest/tasks.py CHANGED
@@ -18,20 +18,10 @@ def harvest(self, ident):
  return # Ignore deleted sources
  Backend = backends.get(current_app, source.backend)
  backend = Backend(source)
- items = backend.perform_initialization()
- if items is None:
- pass
- elif items == 0:
- backend.finalize()
- else:
- finalize = harvest_job_finalize.s(backend.job.id)
- items = [
- harvest_job_item.s(backend.job.id, item.remote_id)
- for item in backend.job.items
- ]
- chord(items)(finalize)
-
 
+ backend.harvest()
+
+
 
  @task(ignore_result=False, route='low.harvest')
  def harvest_job_item(job_id, item_id):
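
With the per-item Celery chord removed, the `harvest` task now runs the whole job in-process through `backend.harvest()`. A hedged usage sketch of the resulting flow, as the test suite exercises it via `actions.run()` (the `source` object and the exact import path are assumptions; the status values are those set by `BaseBackend.harvest()`):

```python
from udata.harvest import actions
from udata.harvest.models import HarvestJob

# `source` is assumed to be an existing HarvestSource; actions.run() drives
# the backend synchronously instead of fanning out one Celery task per item.
actions.run(source.slug)

job = HarvestJob.objects(source=source).first()
assert job.status in ('done', 'done-errors', 'failed')
```
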
udata/harvest/tests/factories.py CHANGED
@@ -6,9 +6,10 @@ from flask.signals import Namespace
 
  from udata.factories import ModelFactory
  from udata.core.dataset.factories import DatasetFactory
+ from udata.core.dataset.models import Dataset
 
  from .. import backends
- from ..models import HarvestSource, HarvestJob
+ from ..models import HarvestItem, HarvestSource, HarvestJob
 
 
  def dtfactory(start, end):
@@ -55,14 +56,20 @@ class FactoryBackend(backends.BaseBackend):
  backends.HarvestFeature('toggled', 'Toggled', 'A togglable', True),
  )
 
- def initialize(self):
+ def inner_harvest(self):
  mock_initialize.send(self)
  for i in range(self.config.get('count', DEFAULT_COUNT)):
- self.add_item(i)
+ self.process_dataset(str(i))
+ if self.is_done():
+ return
 
- def process(self, item):
- mock_process.send(self, item=item)
- return DatasetFactory.build(title='dataset-{0}'.format(item.remote_id))
+ def inner_process_dataset(self, item: HarvestItem):
+ mock_process.send(self, item=item.remote_id)
+
+ dataset = self.get_dataset(item.remote_id)
+ dataset.title = f'dataset-{item.remote_id}'
+
+ return dataset
 
 
  class MockBackendsMixin(object):
udata/harvest/tests/test_actions.py CHANGED
@@ -580,7 +580,7 @@ class ExecutionTestMixin(MockBackendsMixin):
 
  def test_error_on_item(self):
  def process(self, item):
- if item.remote_id == '1':
+ if item == '1':
  raise ValueError('test')
 
  source = HarvestSourceFactory(backend='factory')
@@ -723,7 +723,7 @@ class HarvestPreviewTest(MockBackendsMixin):
 
  def test_preview_with_error_on_item(self):
  def process(self, item):
- if item.remote_id == '1':
+ if item == '1':
  raise ValueError('test')
 
  source = HarvestSourceFactory(backend='factory')
udata/harvest/tests/test_base_backend.py CHANGED
@@ -5,6 +5,7 @@ from urllib.parse import urlparse
  from dateutil.parser import parse
  from voluptuous import Schema
 
+ from udata.harvest.models import HarvestItem
  from udata.utils import faker
  from udata.core.dataset import tasks
  from udata.core.dataset.factories import DatasetFactory
@@ -31,12 +32,16 @@ class FakeBackend(BaseBackend):
  HarvestFeature('enabled', 'A test feature enabled by default', default=True),
  )
 
- def initialize(self):
+ def inner_harvest(self):
  for i in range(self.source.config.get('nb_datasets', 3)):
- self.add_item('fake-{0}'.format(i))
+ remote_id = f'fake-{i}'
+ self.process_dataset(remote_id)
+ if self.is_done():
+ return
 
- def process(self, item):
+ def inner_process_dataset(self, item: HarvestItem):
  dataset = self.get_dataset(item.remote_id)
+
  for key, value in DatasetFactory.as_dict(visible=True).items():
  setattr(dataset, key, value)
  if self.source.config.get('last_modified'):
@@ -219,8 +224,7 @@ class BaseBackendTest:
  assert 'archived_at' not in dataset_no_arch.harvest
 
  # test unarchive: archive manually then relaunch harvest
- q = {'harvest__remote_id': 'fake-1'}
- dataset = Dataset.objects.get(**q)
+ dataset = Dataset.objects.get(**{'harvest__remote_id': 'fake-1'})
  dataset.archived = datetime.utcnow()
  dataset.harvest.archived = 'not-on-remote'
  dataset.harvest.archived_at = datetime.utcnow()
udata/harvest/tests/test_dcat_backend.py CHANGED
@@ -240,6 +240,19 @@ class DcatBackendTest:
  actions.purge_jobs()
  assert get_from_json(current_app.config.get('HARVEST_GRAPHS_S3_BUCKET'), job.data['filename']) is None
 
+ @pytest.mark.options(SCHEMA_CATALOG_URL='https://example.com/schemas', HARVEST_MAX_ITEMS=2)
+ def test_harvest_max_items(self, rmock):
+ rmock.get('https://example.com/schemas', json=ResourceSchemaMockData.get_mock_data())
+
+ filename = 'bnodes.xml'
+ url = mock_dcat(rmock, filename)
+ org = OrganizationFactory()
+ source = HarvestSourceFactory(backend='dcat', url=url, organization=org)
+
+ actions.run(source.slug)
+
+ assert Dataset.objects.count() == 2
+ assert HarvestJob.objects.first().status == 'done'
 
  @pytest.mark.options(SCHEMA_CATALOG_URL='https://example.com/schemas')
  def test_harvest_spatial(self, rmock):