udata 10.8.2.dev37001__py2.py3-none-any.whl → 10.8.3__py2.py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of udata might be problematic.

Files changed (63)
  1. udata/__init__.py +1 -1
  2. udata/core/activity/models.py +23 -1
  3. udata/core/dataset/api_fields.py +2 -0
  4. udata/core/dataset/apiv2.py +4 -0
  5. udata/core/dataset/constants.py +1 -0
  6. udata/core/dataset/csv.py +1 -0
  7. udata/core/dataset/forms.py +6 -0
  8. udata/core/dataset/metrics.py +34 -0
  9. udata/core/dataset/models.py +15 -3
  10. udata/core/dataset/tasks.py +0 -11
  11. udata/core/metrics/__init__.py +1 -0
  12. udata/core/metrics/commands.py +3 -0
  13. udata/core/organization/csv.py +9 -26
  14. udata/core/organization/metrics.py +2 -0
  15. udata/core/organization/models.py +14 -9
  16. udata/core/user/metrics.py +2 -0
  17. udata/harvest/backends/dcat.py +161 -165
  18. udata/harvest/tests/ckan/test_ckan_backend.py +1 -1
  19. udata/harvest/tests/dcat/catalog.xml +1 -0
  20. udata/harvest/tests/test_dcat_backend.py +19 -6
  21. udata/migrations/2025-07-30-purge-old-harvest-dynamic-fields.py +29 -0
  22. udata/settings.py +1 -1
  23. udata/static/chunks/{13.2d06442dd9a05d9777b5.js → 13.d9c1735d14038b94c17e.js} +2 -2
  24. udata/static/chunks/{13.2d06442dd9a05d9777b5.js.map → 13.d9c1735d14038b94c17e.js.map} +1 -1
  25. udata/static/chunks/{17.e8e4caaad5cb0cc0bacc.js → 17.81c57c0dedf812e43013.js} +2 -2
  26. udata/static/chunks/{17.e8e4caaad5cb0cc0bacc.js.map → 17.81c57c0dedf812e43013.js.map} +1 -1
  27. udata/static/chunks/{19.f03a102365af4315f9db.js → 19.8d03c06efcac6884bebe.js} +3 -3
  28. udata/static/chunks/{19.f03a102365af4315f9db.js.map → 19.8d03c06efcac6884bebe.js.map} +1 -1
  29. udata/static/chunks/{5.0fa1408dae4e76b87b2e.js → 5.343ca020a2d38cec1a14.js} +3 -3
  30. udata/static/chunks/{5.0fa1408dae4e76b87b2e.js.map → 5.343ca020a2d38cec1a14.js.map} +1 -1
  31. udata/static/chunks/{6.d663709d877baa44a71e.js → 6.a3b07de9dd2ca2d24e85.js} +3 -3
  32. udata/static/chunks/{6.d663709d877baa44a71e.js.map → 6.a3b07de9dd2ca2d24e85.js.map} +1 -1
  33. udata/static/chunks/{8.778091d55cd8ea39af6b.js → 8.b966402f5d680d4bdf4a.js} +2 -2
  34. udata/static/chunks/{8.778091d55cd8ea39af6b.js.map → 8.b966402f5d680d4bdf4a.js.map} +1 -1
  35. udata/static/common.js +1 -1
  36. udata/static/common.js.map +1 -1
  37. udata/tests/api/test_datasets_api.py +0 -46
  38. udata/tests/dataset/test_dataset_model.py +63 -17
  39. udata/tests/organization/test_csv_adapter.py +3 -15
  40. udata/tests/reuse/test_reuse_model.py +6 -4
  41. udata/translations/ar/LC_MESSAGES/udata.mo +0 -0
  42. udata/translations/ar/LC_MESSAGES/udata.po +62 -54
  43. udata/translations/de/LC_MESSAGES/udata.mo +0 -0
  44. udata/translations/de/LC_MESSAGES/udata.po +62 -54
  45. udata/translations/es/LC_MESSAGES/udata.mo +0 -0
  46. udata/translations/es/LC_MESSAGES/udata.po +62 -54
  47. udata/translations/fr/LC_MESSAGES/udata.mo +0 -0
  48. udata/translations/fr/LC_MESSAGES/udata.po +62 -54
  49. udata/translations/it/LC_MESSAGES/udata.mo +0 -0
  50. udata/translations/it/LC_MESSAGES/udata.po +62 -54
  51. udata/translations/pt/LC_MESSAGES/udata.mo +0 -0
  52. udata/translations/pt/LC_MESSAGES/udata.po +62 -54
  53. udata/translations/sr/LC_MESSAGES/udata.mo +0 -0
  54. udata/translations/sr/LC_MESSAGES/udata.po +62 -54
  55. udata/translations/udata.pot +63 -56
  56. udata/utils.py +16 -0
  57. {udata-10.8.2.dev37001.dist-info → udata-10.8.3.dist-info}/METADATA +16 -3
  58. {udata-10.8.2.dev37001.dist-info → udata-10.8.3.dist-info}/RECORD +62 -61
  59. udata/harvest/backends/ckan/models.py +0 -10
  60. {udata-10.8.2.dev37001.dist-info → udata-10.8.3.dist-info}/LICENSE +0 -0
  61. {udata-10.8.2.dev37001.dist-info → udata-10.8.3.dist-info}/WHEEL +0 -0
  62. {udata-10.8.2.dev37001.dist-info → udata-10.8.3.dist-info}/entry_points.txt +0 -0
  63. {udata-10.8.2.dev37001.dist-info → udata-10.8.3.dist-info}/top_level.txt +0 -0

udata/harvest/backends/dcat.py CHANGED
@@ -1,11 +1,12 @@
 import logging
 from datetime import date
-from typing import Generator
+from typing import ClassVar, Generator

 import lxml.etree as ET
 from flask import current_app
 from rdflib import Graph
 from rdflib.namespace import RDF
+from typing_extensions import override

 from udata.core.dataservices.rdf import dataservice_from_rdf
 from udata.core.dataset.rdf import dataset_from_rdf
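
The new typing_extensions import backports PEP 698's @override decorator, used further down on CswIso19139DcatBackend.as_dcat. A minimal standalone sketch of what the decorator buys (illustrative, not udata code):

    from typing_extensions import override

    class Base:
        def as_dcat(self, tree):
            return tree

    class Child(Base):
        @override  # type checkers verify this really overrides Base.as_dcat
        def as_dcat(self, tree):
            return tree

    # Renaming Base.as_dcat without updating Child would now be flagged by
    # mypy/pyright instead of silently leaving a dead method behind.

At runtime the decorator only sets an `__override__` attribute; the check happens during static analysis.
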
@@ -55,9 +56,6 @@ URIS_TO_REPLACE = {
 }


-SAFE_PARSER = ET.XMLParser(resolve_entities=False)
-
-
 def extract_graph(source, target, node, specs):
     for p, o in source.predicate_objects(node):
         target.add((node, p, o))
@@ -68,20 +66,28 @@ def extract_graph(source, target, node, specs):
 class DcatBackend(BaseBackend):
     display_name = "DCAT"

+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self.organizations_to_update = set()
+
     def inner_harvest(self):
         fmt = self.get_format()
         self.job.data = {"format": fmt}

-        serialized_graphs = []
+        pages = []

         for page_number, page in self.walk_graph(self.source.url, fmt):
             self.process_one_datasets_page(page_number, page)
-            serialized_graphs.append(page.serialize(format=fmt, indent=None))
+            pages.append((page_number, page))
+
+        for org in self.organizations_to_update:
+            org.compute_aggregate_metrics = True
+            org.count_datasets()

         # We do a second pass to have all datasets in memory and attach datasets
         # to dataservices. It could be better to be one pass of graph walking and
         # then one pass of attaching datasets to dataservices.
-        for page_number, page in self.walk_graph(self.source.url, fmt):
+        for page_number, page in pages:
             self.process_one_dataservices_page(page_number, page)

         if not self.dryrun and self.has_reached_max_items():
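
The new `organizations_to_update` set batches metric updates: while datasets are saved (see `inner_process_dataset` further down), each touched organization has `compute_aggregate_metrics` switched off, and the recount runs once per organization after the walk; keeping the parsed pages in memory also avoids walking the remote graph a second time for the dataservices pass. A standalone sketch of the batching pattern (the class and helpers are illustrative; only the attribute names follow the diff):

    class HarvesterSketch:
        def __init__(self):
            self.organizations_to_update = set()

        def process_dataset(self, dataset):
            if dataset.organization:
                # Skip the expensive per-save aggregate recomputation during the run...
                dataset.organization.compute_aggregate_metrics = False
                self.organizations_to_update.add(dataset.organization)
            dataset.save()

        def finalize(self):
            # ...then recount once per organization at the end.
            for org in self.organizations_to_update:
                org.compute_aggregate_metrics = True
                org.count_datasets()
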
@@ -100,6 +106,8 @@ class DcatBackend(BaseBackend):

         bucket = current_app.config.get("HARVEST_GRAPHS_S3_BUCKET")

+        serialized_graphs = [p.serialize(format=fmt, indent=None) for _, p in pages]
+
         if (
             bucket is not None
             and sum([len(g.encode("utf-8")) for g in serialized_graphs])
@@ -202,7 +210,10 @@
         )

     def process_one_dataservices_page(self, page_number: int, page: Graph):
+        access_services = {o for _, _, o in page.triples((None, DCAT.accessService, None))}
         for node in page.subjects(RDF.type, DCAT.DataService):
+            if node in access_services:
+                continue
             remote_id = page.value(node, DCT.identifier)
             self.process_dataservice(remote_id, page_number=page_number, page=page, node=node)

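The guard above skips DataService nodes that only appear as the object of a `dcat:accessService` triple on a distribution, so they are no longer harvested (or skipped) as standalone dataservices. A self-contained rdflib sketch of the same query, assuming a tiny in-memory graph:

    from rdflib import RDF, BNode, Graph, Namespace

    DCAT = Namespace("http://www.w3.org/ns/dcat#")

    page = Graph()
    standalone, embedded, distribution = BNode(), BNode(), BNode()
    page.add((standalone, RDF.type, DCAT.DataService))
    page.add((embedded, RDF.type, DCAT.DataService))
    # `embedded` is only reachable as the access service of a distribution
    page.add((distribution, DCAT.accessService, embedded))

    access_services = {o for _, _, o in page.triples((None, DCAT.accessService, None))}
    for node in page.subjects(RDF.type, DCAT.DataService):
        if node in access_services:
            continue  # described only inside a distribution: skip
        print("would harvest", node)  # only `standalone` is processed
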
@@ -214,7 +225,11 @@

         dataset = self.get_dataset(item.remote_id)
         remote_url_prefix = self.get_extra_config_value("remote_url_prefix")
-        return dataset_from_rdf(page, dataset, node=node, remote_url_prefix=remote_url_prefix)
+        dataset = dataset_from_rdf(page, dataset, node=node, remote_url_prefix=remote_url_prefix)
+        if dataset.organization:
+            dataset.organization.compute_aggregate_metrics = False
+            self.organizations_to_update.add(dataset.organization)
+        return dataset

     def inner_process_dataservice(self, item: HarvestItem, page_number: int, page: Graph, node):
         item.kwargs["page_number"] = page_number
@@ -235,104 +250,165 @@
                 return node
         raise ValueError(f"Unable to find dataset with DCT.identifier:{item.remote_id}")

-    def next_record_if_should_continue(self, start, search_results):
-        next_record = int(search_results.attrib["nextRecord"])
-        matched_count = int(search_results.attrib["numberOfRecordsMatched"])
-        returned_count = int(search_results.attrib["numberOfRecordsReturned"])

-        # Break conditions copied gratefully from
-        # noqa https://github.com/geonetwork/core-geonetwork/blob/main/harvesters/src/main/java/org/fao/geonet/kernel/harvest/harvester/csw/Harvester.java#L338-L369
-        break_conditions = (
-            # standard CSW: A value of 0 means all records have been returned.
-            next_record == 0,
-            # Misbehaving CSW server returning a next record > matched count
-            next_record > matched_count,
-            # No results returned already
-            returned_count == 0,
-            # Current next record is lower than previous one
-            next_record < start,
-            # Enough items have been harvested already
-            self.max_items and len(self.job.items) >= self.max_items,
-        )
+class CswDcatBackend(DcatBackend):
+    """
+    CSW harvester fetching records as DCAT.
+    The parsing of items is then the same as for the DcatBackend.
+    """

-        if any(break_conditions):
-            return None
-        else:
-            return next_record
+    display_name = "CSW-DCAT"

+    # CSW_REQUEST is based on:
+    # - Request syntax from spec [1] and example requests [1] [2].
+    # - Sort settings to ensure stable paging [3].
+    # - Filter settings to only retrieve record types currently mapped in udata.
+    #
+    # If you modify the request, make sure:
+    # - `typeNames` and `outputSchema` are consistent. You'll likely want to keep "gmd:MD_Metadata",
+    #   since "csw:Record" contains less information.
+    # - `typeNames` and namespaces in `csw:Query` (`Filter`, `SortBy`, ...) are consistent, although
+    #   they are ignored on some servers [4] [5].
+    # - It works on real catalogs! Not many servers implement the whole spec.
+    #
+    # References:
+    # [1] OpenGIS Catalogue Services Specification 2.0.2 – ISO Metadata Application Profile: Corrigendum
+    #     https://portal.ogc.org/files/80534
+    # [2] GeoNetwork - CSW test requests
+    #     https://github.com/geonetwork/core-geonetwork/tree/3.10.4/web/src/main/webapp/xml/csw/test
+    # [3] Udata - Support csw dcat harvest
+    #     https://github.com/opendatateam/udata/pull/2800#discussion_r1129053500
+    # [4] GeoNetwork - GetRecords ignores namespaces for Filter/SortBy fields
+    #     https://github.com/geonetwork/core-geonetwork/blob/3.10.4/csw-server/src/main/java/org/fao/geonet/kernel/csw/services/getrecords/FieldMapper.java#L92
+    # [5] GeoNetwork - GetRecords ignores `typeNames`
+    #     https://github.com/geonetwork/core-geonetwork/blob/3.10.4/csw-server/src/main/java/org/fao/geonet/kernel/csw/services/getrecords/CatalogSearcher.java#L194
+    CSW_REQUEST: ClassVar[str] = """
+        <csw:GetRecords xmlns:apiso="http://www.opengis.net/cat/csw/apiso/1.0"
+                        xmlns:csw="http://www.opengis.net/cat/csw/2.0.2"
+                        xmlns:ogc="http://www.opengis.net/ogc"
+                        service="CSW" version="2.0.2" outputFormat="application/xml"
+                        resultType="results" startPosition="{start}" maxRecords="25"
+                        outputSchema="{output_schema}">
+            <csw:Query typeNames="gmd:MD_Metadata">
+                <csw:ElementSetName>full</csw:ElementSetName>
+                <csw:Constraint version="1.1.0">
+                    <ogc:Filter>
+                        <ogc:Or>
+                            <ogc:PropertyIsEqualTo>
+                                <ogc:PropertyName>apiso:type</ogc:PropertyName>
+                                <ogc:Literal>dataset</ogc:Literal>
+                            </ogc:PropertyIsEqualTo>
+                            <ogc:PropertyIsEqualTo>
+                                <ogc:PropertyName>apiso:type</ogc:PropertyName>
+                                <ogc:Literal>nonGeographicDataset</ogc:Literal>
+                            </ogc:PropertyIsEqualTo>
+                            <ogc:PropertyIsEqualTo>
+                                <ogc:PropertyName>apiso:type</ogc:PropertyName>
+                                <ogc:Literal>series</ogc:Literal>
+                            </ogc:PropertyIsEqualTo>
+                            <ogc:PropertyIsEqualTo>
+                                <ogc:PropertyName>apiso:type</ogc:PropertyName>
+                                <ogc:Literal>service</ogc:Literal>
+                            </ogc:PropertyIsEqualTo>
+                        </ogc:Or>
+                    </ogc:Filter>
+                </csw:Constraint>
+                <ogc:SortBy>
+                    <ogc:SortProperty>
+                        <ogc:PropertyName>apiso:identifier</ogc:PropertyName>
+                        <ogc:SortOrder>ASC</ogc:SortOrder>
+                    </ogc:SortProperty>
+                </ogc:SortBy>
+            </csw:Query>
+        </csw:GetRecords>
+    """

-class CswDcatBackend(DcatBackend):
-    display_name = "CSW-DCAT"
+    CSW_OUTPUT_SCHEMA = "http://www.w3.org/ns/dcat#"

-    DCAT_SCHEMA = "http://www.w3.org/ns/dcat#"
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self.xml_parser = ET.XMLParser(resolve_entities=False)

     def walk_graph(self, url: str, fmt: str) -> Generator[tuple[int, Graph], None, None]:
         """
         Yield all RDF pages as `Graph` from the source
         """
-        body = """<csw:GetRecords xmlns:csw="http://www.opengis.net/cat/csw/2.0.2"
-                xmlns:gmd="http://www.isotc211.org/2005/gmd"
-                service="CSW" version="2.0.2" resultType="results"
-                startPosition="{start}" maxPosition="200"
-                outputSchema="{schema}">
-                <csw:Query typeNames="gmd:MD_Metadata">
-                    <csw:ElementSetName>full</csw:ElementSetName>
-                    <ogc:SortBy xmlns:ogc="http://www.opengis.net/ogc">
-                        <ogc:SortProperty>
-                            <ogc:PropertyName>identifier</ogc:PropertyName>
-                            <ogc:SortOrder>ASC</ogc:SortOrder>
-                        </ogc:SortProperty>
-                    </ogc:SortBy>
-                </csw:Query>
-                </csw:GetRecords>"""
-        headers = {"Content-Type": "application/xml"}
-
         page_number = 0
         start = 1

-        response = self.post(
-            url, data=body.format(start=start, schema=self.DCAT_SCHEMA), headers=headers
-        )
-        response.raise_for_status()
-        content = response.content
-        tree = ET.fromstring(content, parser=SAFE_PARSER)
-        if tree.tag == "{" + OWS_NAMESPACE + "}ExceptionReport":
-            raise ValueError(f"Failed to query CSW:\n{content}")
-        while tree is not None:
+        while True:
+            data = self.CSW_REQUEST.format(output_schema=self.CSW_OUTPUT_SCHEMA, start=start)
+            response = self.post(url, data=data, headers={"Content-Type": "application/xml"})
+            response.raise_for_status()
+
+            content = response.content
+            tree = ET.fromstring(content, parser=self.xml_parser)
+            if tree.tag == "{" + OWS_NAMESPACE + "}ExceptionReport":
+                raise ValueError(f"Failed to query CSW:\n{content}")
+
             search_results = tree.find("csw:SearchResults", {"csw": CSW_NAMESPACE})
-            if search_results is None:
+            if not search_results:
                 log.error(f"No search results found for {url} on page {page_number}")
-                break
-            for child in search_results:
+                return
+
+            for result in search_results:
                 subgraph = Graph(namespace_manager=namespace_manager)
-                subgraph.parse(data=ET.tostring(child), format=fmt)
+                doc = ET.tostring(self.as_dcat(result))
+                subgraph.parse(data=doc, format=fmt)
+
+                if not subgraph.subjects(
+                    RDF.type, [DCAT.Dataset, DCAT.DatasetSeries, DCAT.DataService]
+                ):
+                    raise ValueError("Failed to fetch CSW content")

                 yield page_number, subgraph
+
                 if self.has_reached_max_items():
                     return

-            next_record = self.next_record_if_should_continue(start, search_results)
-            if not next_record:
-                break
-
-            start = next_record
             page_number += 1
+            start = self.next_position(start, search_results)
+            if not start:
+                return

-            tree = ET.fromstring(
-                self.post(
-                    url, data=body.format(start=start, schema=self.DCAT_SCHEMA), headers=headers
-                ).content,
-                parser=SAFE_PARSER,
-            )
+    def as_dcat(self, tree: ET._Element) -> ET._Element:
+        """
+        Return the input tree as a DCAT tree.
+        For CswDcatBackend, this method return the incoming tree as-is, since it's already DCAT.
+        For subclasses of CswDcatBackend, this method should convert the incoming tree to DCAT.
+        """
+        return tree

+    def next_position(self, start: int, search_results: ET._Element) -> int | None:
+        next_record = int(search_results.attrib["nextRecord"])
+        matched_count = int(search_results.attrib["numberOfRecordsMatched"])
+        returned_count = int(search_results.attrib["numberOfRecordsReturned"])

-class CswIso19139DcatBackend(DcatBackend):
+        # Break conditions copied gratefully from
+        # noqa https://github.com/geonetwork/core-geonetwork/blob/main/harvesters/src/main/java/org/fao/geonet/kernel/harvest/harvester/csw/Harvester.java#L338-L369
+        should_break = (
+            # A value of 0 means all records have been returned (standard CSW)
+            (next_record == 0)
+            # Misbehaving CSW server returning a next record > matched count
+            or (next_record > matched_count)
+            # No results returned already
+            or (returned_count == 0)
+            # Current next record is lower than previous one
+            or (next_record < start)
+            # Enough items have been harvested already
+            or self.has_reached_max_items()
+        )
+        return None if should_break else next_record
+
+
+class CswIso19139DcatBackend(CswDcatBackend):
     """
-    An harvester that takes CSW ISO 19139 as input and transforms it to DCAT using SEMIC GeoDCAT-AP XSLT.
+    CSW harvester fetching records as ISO-19139 and using XSLT to convert them to DCAT.
     The parsing of items is then the same as for the DcatBackend.
     """

     display_name = "CSW-ISO-19139"
+
     extra_configs = (
         HarvestExtraConfig(
             _("Remote URL prefix"),
@@ -342,94 +418,14 @@ class CswIso19139DcatBackend(DcatBackend):
         ),
     )

-    ISO_SCHEMA = "http://www.isotc211.org/2005/gmd"
-
-    def walk_graph(self, url: str, fmt: str) -> Generator[tuple[int, Graph], None, None]:
-        """
-        Yield all RDF pages as `Graph` from the source
-
-        Parse CSW graph querying ISO schema.
-        Use SEMIC GeoDCAT-AP XSLT to map it to a correct version.
-        See https://github.com/SEMICeu/iso-19139-to-dcat-ap for more information on the XSLT.
-        """
-        # Load XSLT
-        xsl_url = current_app.config["HARVEST_ISO19139_XSL_URL"]
-        xsl = ET.fromstring(self.get(xsl_url).content, parser=SAFE_PARSER)
-        transform = ET.XSLT(xsl)
-
-        # Start querying and parsing graph
-        # Filter on dataset or serie records
-        body = """<csw:GetRecords xmlns:csw="http://www.opengis.net/cat/csw/2.0.2"
-                xmlns:gmd="http://www.isotc211.org/2005/gmd"
-                service="CSW" version="2.0.2" resultType="results"
-                startPosition="{start}" maxPosition="10"
-                outputSchema="{schema}">
-                <csw:Query typeNames="csw:Record">
-                    <csw:ElementSetName>full</csw:ElementSetName>
-                    <csw:Constraint version="1.1.0">
-                        <ogc:Filter xmlns:ogc="http://www.opengis.net/ogc">
-                            <ogc:Or xmlns:ogc="http://www.opengis.net/ogc">
-                                <ogc:PropertyIsEqualTo>
-                                    <ogc:PropertyName>dc:type</ogc:PropertyName>
-                                    <ogc:Literal>dataset</ogc:Literal>
-                                </ogc:PropertyIsEqualTo>
-                                <ogc:PropertyIsEqualTo>
-                                    <ogc:PropertyName>dc:type</ogc:PropertyName>
-                                    <ogc:Literal>service</ogc:Literal>
-                                </ogc:PropertyIsEqualTo>
-                                <ogc:PropertyIsEqualTo>
-                                    <ogc:PropertyName>dc:type</ogc:PropertyName>
-                                    <ogc:Literal>series</ogc:Literal>
-                                </ogc:PropertyIsEqualTo>
-                            </ogc:Or>
-                        </ogc:Filter>
-                    </csw:Constraint>
-                </csw:Query>
-                </csw:GetRecords>"""
-        headers = {"Content-Type": "application/xml"}
-
-        page_number = 0
-        start = 1
-
-        response = self.post(
-            url, data=body.format(start=start, schema=self.ISO_SCHEMA), headers=headers
-        )
-        response.raise_for_status()
-
-        tree_before_transform = ET.fromstring(response.content, parser=SAFE_PARSER)
-        # Disabling CoupledResourceLookUp to prevent failure on xlink:href
-        # https://github.com/SEMICeu/iso-19139-to-dcat-ap/blob/master/documentation/HowTo.md#parameter-coupledresourcelookup
-        tree = transform(tree_before_transform, CoupledResourceLookUp="'disabled'")
-
-        while tree:
-            # We query the tree before the transformation because the XSLT remove the search results
-            # infos (useful for pagination)
-            search_results = tree_before_transform.find("csw:SearchResults", {"csw": CSW_NAMESPACE})
-            if search_results is None:
-                log.error(f"No search results found for {url} on page {page_number}")
-                break
-
-            subgraph = Graph(namespace_manager=namespace_manager)
-            subgraph.parse(ET.tostring(tree), format=fmt)
-
-            if not subgraph.subjects(RDF.type, DCAT.Dataset):
-                raise ValueError("Failed to fetch CSW content")
+    CSW_OUTPUT_SCHEMA = "http://www.isotc211.org/2005/gmd"

-            yield page_number, subgraph
-            if self.has_reached_max_items():
-                return
-
-            next_record = self.next_record_if_should_continue(start, search_results)
-            if not next_record:
-                break
-
-            start = next_record
-            page_number += 1
-
-            response = self.post(
-                url, data=body.format(start=start, schema=self.ISO_SCHEMA), headers=headers
-            )
-            response.raise_for_status()
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        xslt_url = current_app.config["HARVEST_ISO19139_XSLT_URL"]
+        xslt = ET.fromstring(self.get(xslt_url).content, parser=self.xml_parser)
+        self.transform = ET.XSLT(xslt)

-        tree_before_transform = ET.fromstring(response.content, parser=SAFE_PARSER)
-        tree = transform(tree_before_transform, CoupledResourceLookUp="'disabled'")
+    @override
+    def as_dcat(self, tree: ET._Element) -> ET._Element:
+        return self.transform(tree, CoupledResourceLookUp="'disabled'")
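
The refactor turns two copy-pasted `walk_graph` loops into a template method: `CswDcatBackend.walk_graph` owns paging, and subclasses only override `as_dcat` to convert each record. For ISO-19139 that conversion is an lxml XSLT transform whose string parameters must be quoted XPath literals, hence `CoupledResourceLookUp="'disabled'"`. A minimal standalone sketch of lxml's XSLT API, with a made-up stylesheet standing in for the SEMIC one:

    import lxml.etree as ET

    # Toy stylesheet accepting a parameter, in place of the real XSLT
    # fetched from HARVEST_ISO19139_XSLT_URL.
    XSLT = b"""<xsl:stylesheet version="1.0"
        xmlns:xsl="http://www.w3.org/1999/XSL/Transform">
      <xsl:param name="CoupledResourceLookUp"/>
      <xsl:template match="/record">
        <dcat><xsl:value-of select="$CoupledResourceLookUp"/></dcat>
      </xsl:template>
    </xsl:stylesheet>"""

    parser = ET.XMLParser(resolve_entities=False)  # same hardening as the backend
    transform = ET.XSLT(ET.fromstring(XSLT, parser=parser))

    record = ET.fromstring(b"<record/>", parser=parser)
    # String parameters are XPath expressions, so literals need inner quotes.
    result = transform(record, CoupledResourceLookUp="'disabled'")
    print(ET.tostring(result))  # b'<dcat>disabled</dcat>'
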
udata/harvest/tests/ckan/test_ckan_backend.py CHANGED
@@ -460,7 +460,7 @@ def test_skip_no_resources(source, result):
 def test_ckan_url_is_url(data, result):
     dataset = dataset_for(result)
     assert dataset.harvest.remote_url == data["url"]
-    assert not hasattr(dataset.harvest, "ckan_source")
+    assert dataset.harvest.ckan_source is None


 @pytest.mark.ckan_data("ckan_url_is_a_string")

udata/harvest/tests/dcat/catalog.xml CHANGED
@@ -159,6 +159,7 @@
       <dcat:accessURL>http://data.test.org/datasets/1/resources/4/services?SERVICE=WMS&amp;REQUEST=GetCapabilities&amp;VERSION=1.3.0</dcat:accessURL>
       <dcat:accessService>
         <dcat:DataService>
+          <rdf:type rdf:resource="http://www.w3.org/ns/dcat#DataService"/>
           <dcterms:title xml:lang="fr">Geo Service</dcterms:title>
           <dcat:endpointURL rdf:resource="http://data.test.org/datasets/1/resources/4/services"/>
           <dcat:endpointDescription rdf:resource="http://data.test.org/datasets/1/resources/4/services?SERVICE=WMS&amp;REQUEST=GetCapabilities&amp;VERSION=1.3.0"/>

udata/harvest/tests/test_dcat_backend.py CHANGED
@@ -187,6 +187,23 @@ class DcatBackendTest:
             == "https://data.paris2024.org/api/explore/v2.1/console"
         )

+    def test_harvest_dataservices_ignore_accessservices(self, rmock):
+        rmock.get("https://example.com/schemas", json=ResourceSchemaMockData.get_mock_data())
+
+        url = mock_dcat(rmock, "catalog.xml")
+        org = OrganizationFactory()
+        source = HarvestSourceFactory(backend="dcat", url=url, organization=org)
+
+        actions.run(source)
+
+        source.reload()
+
+        job = source.get_last_job()
+        assert len(job.items) == 4
+
+        dataservices = Dataservice.objects
+        assert len(dataservices) == 0
+
     def test_harvest_literal_spatial(self, rmock):
         url = mock_dcat(rmock, "evian.json")
         org = OrganizationFactory()
@@ -478,12 +495,8 @@ class DcatBackendTest:

         assert job.status == "done"
         assert job.errors == []
-        assert len(job.items) == 5
-        # 4 datasets and one Dataservice mentionned but not described
-        # because it appears in a distribution as DCAT.accessService
-        # but is missing a proper DCT.identifier
+        assert len(job.items) == 4
         assert len([item for item in job.items if item.status == "done"]) == 4
-        assert len([item for item in job.items if item.status == "skipped"]) == 1

     def test_xml_catalog(self, rmock):
         LicenseFactory(id="lov2", title="Licence Ouverte Version 2.0")
@@ -886,7 +899,7 @@ class CswIso19139DcatBackendTest:
         with open(os.path.join(CSW_DCAT_FILES_DIR, "XSLT.xml"), "r") as f:
             xslt = f.read()
         url = mock_csw_pagination(rmock, "geonetwork/srv/eng/csw.rdf", "geonetwork-iso-page-{}.xml")
-        rmock.get(current_app.config.get("HARVEST_ISO19139_XSL_URL"), text=xslt)
+        rmock.get(current_app.config.get("HARVEST_ISO19139_XSLT_URL"), text=xslt)
         org = OrganizationFactory()
         source = HarvestSourceFactory(
             backend="csw-iso-19139",

udata/migrations/2025-07-30-purge-old-harvest-dynamic-fields.py ADDED
@@ -0,0 +1,29 @@
+"""
+This migration removes legacy harvest dynamic fields
+"""
+
+import logging
+
+from mongoengine.connection import get_db
+
+log = logging.getLogger(__name__)
+
+
+def migrate(db):
+    # Remove legacy fields (`ods_has_records`, `ods_url`, ...) from old harvested datasets and resources
+    dataset_legacy_fields = ["ods_has_records", "ods_url", "ods_geo"]
+    for field in dataset_legacy_fields:
+        result = get_db().dataset.update_many({}, {"$unset": {f"harvest.{field}": 1}})
+        log.info(
+            f"Harvest Dataset dynamic legacy fields ({field}) removed from {result.modified_count} objects"
+        )
+
+    resource_legacy_fields = ["ods_type"]
+    for field in resource_legacy_fields:
+        result = get_db().dataset.update_many(
+            {"resources": {"$exists": True, "$type": "array"}},
+            {"$unset": {f"resources.$[].harvest.{field}": 1}},
+        )
+        log.info(
+            f"Harvest Resource dynamic legacy fields ({field}) removed from {result.modified_count} objects"
+        )
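
The second `update_many` relies on MongoDB's all-positional operator `$[]`, which applies the `$unset` to every element of the `resources` array. A small pymongo sketch of the same shape, against a hypothetical throwaway database:

    from pymongo import MongoClient

    db = MongoClient()["udata-example"]  # hypothetical database name

    # Remove `harvest.ods_type` from each embedded resource of every dataset
    # that actually has a `resources` array.
    result = db.dataset.update_many(
        {"resources": {"$exists": True, "$type": "array"}},
        {"$unset": {"resources.$[].harvest.ods_type": 1}},
    )
    print(f"{result.modified_count} datasets cleaned")
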
udata/settings.py CHANGED
@@ -283,7 +283,7 @@ class Defaults(object):
     HARVEST_GRAPHS_S3_BUCKET = None  # If the catalog is bigger than `HARVEST_MAX_CATALOG_SIZE_IN_MONGO` store the graph inside S3 instead of MongoDB
     HARVEST_GRAPHS_S3_FILENAME_PREFIX = ""  # Useful to store the graphs inside a subfolder of the bucket. For example by setting `HARVEST_GRAPHS_S3_FILENAME_PREFIX = 'graphs/'`

-    HARVEST_ISO19139_XSL_URL = "https://raw.githubusercontent.com/SEMICeu/iso-19139-to-dcat-ap/refs/heads/geodcat-ap-2.0.0/iso-19139-to-dcat-ap.xsl"
+    HARVEST_ISO19139_XSLT_URL = "https://raw.githubusercontent.com/SEMICeu/iso-19139-to-dcat-ap/refs/heads/geodcat-ap-2.0.0/iso-19139-to-dcat-ap.xsl"

     # S3 connection details
     S3_URL = None
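
These settings drive the size check shown earlier in `inner_harvest`: once all pages are serialized, the graphs are stored in S3 only when a bucket is configured and the catalog exceeds the Mongo threshold. A hedged sketch of that decision (the threshold name follows `HARVEST_MAX_CATALOG_SIZE_IN_MONGO` from the comment above; the exact comparison lives in the backend):

    def choose_graph_storage(serialized_graphs, config):
        bucket = config.get("HARVEST_GRAPHS_S3_BUCKET")
        threshold = config.get("HARVEST_MAX_CATALOG_SIZE_IN_MONGO") or 0
        total_bytes = sum(len(g.encode("utf-8")) for g in serialized_graphs)
        # Bucket configured and catalog too big for MongoDB: offload to S3.
        return "s3" if bucket is not None and total_bytes > threshold else "mongo"

    config = {
        "HARVEST_GRAPHS_S3_BUCKET": "harvest",
        "HARVEST_MAX_CATALOG_SIZE_IN_MONGO": 10 * 1024 * 1024,  # 10 MiB, illustrative
    }
    print(choose_graph_storage(["x" * 15_000_000], config))  # -> s3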