udata 7.0.5.dev28172__py2.py3-none-any.whl → 7.0.6__py2.py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of udata might be problematic.
- udata/__init__.py +1 -1
- udata/commands/__init__.py +0 -3
- udata/commands/dcat.py +7 -4
- udata/core/activity/api.py +1 -1
- udata/core/activity/models.py +4 -3
- udata/core/activity/tasks.py +6 -5
- udata/core/dataset/factories.py +2 -4
- udata/core/dataset/models.py +2 -5
- udata/core/dataset/rdf.py +65 -25
- udata/core/dataset/search.py +0 -1
- udata/core/spatial/tests/test_api.py +7 -10
- udata/core/topic/factories.py +2 -2
- udata/harvest/backends/dcat.py +128 -24
- udata/harvest/tests/csw_dcat/XSLT.xml +4298 -0
- udata/harvest/tests/csw_dcat/geonetwork-iso-page-1.xml +1291 -0
- udata/harvest/tests/csw_dcat/geonetwork-iso-page-3.xml +1139 -0
- udata/harvest/tests/csw_dcat/geonetwork-iso-page-5.xml +1266 -0
- udata/harvest/tests/dcat/bnodes.xml +7 -1
- udata/harvest/tests/dcat/evian.json +464 -0
- udata/harvest/tests/test_dcat_backend.py +82 -9
- udata/migrations/2024-03-22-migrate-activity-kwargs-to-extras.py +16 -0
- udata/rdf.py +22 -1
- udata/search/__init__.py +2 -2
- udata/static/chunks/{11.c0ccea08914b6b41568e.js → 11.a23c110811a9ac943478.js} +3 -3
- udata/static/chunks/{11.c0ccea08914b6b41568e.js.map → 11.a23c110811a9ac943478.js.map} +1 -1
- udata/static/chunks/{13.526a25163ababaa44409.js → 13.0889e093f8664e38568c.js} +2 -2
- udata/static/chunks/{13.526a25163ababaa44409.js.map → 13.0889e093f8664e38568c.js.map} +1 -1
- udata/static/chunks/{16.7901839b4227881947f6.js → 16.f41599478d3e97ad9a30.js} +2 -2
- udata/static/chunks/{16.7901839b4227881947f6.js.map → 16.f41599478d3e97ad9a30.js.map} +1 -1
- udata/static/chunks/{19.471d5a2a08eef6e5338a.js → 19.2b534a26af8b17e9170b.js} +3 -3
- udata/static/chunks/{19.471d5a2a08eef6e5338a.js.map → 19.2b534a26af8b17e9170b.js.map} +1 -1
- udata/static/chunks/{5.534e0531d0e2b150146f.js → 5.7115454a1183e5c12eef.js} +3 -3
- udata/static/chunks/{5.534e0531d0e2b150146f.js.map → 5.7115454a1183e5c12eef.js.map} +1 -1
- udata/static/chunks/{6.e56975229e6065f68d2a.js → 6.16bb24fb8240f2746488.js} +3 -3
- udata/static/chunks/{6.e56975229e6065f68d2a.js.map → 6.16bb24fb8240f2746488.js.map} +1 -1
- udata/static/chunks/{9.534426728626f11f4571.js → 9.3e752966ff14e47e11f2.js} +2 -2
- udata/static/chunks/{9.534426728626f11f4571.js.map → 9.3e752966ff14e47e11f2.js.map} +1 -1
- udata/static/common.js +1 -1
- udata/static/common.js.map +1 -1
- udata/tests/api/test_datasets_api.py +45 -45
- udata/tests/api/test_me_api.py +13 -14
- udata/tests/dataset/test_dataset_actions.py +2 -2
- udata/tests/dataset/test_dataset_commands.py +3 -3
- udata/tests/dataset/test_dataset_model.py +2 -1
- udata/tests/organization/test_organization_model.py +3 -3
- udata/tests/organization/test_organization_rdf.py +3 -3
- udata/tests/reuse/test_reuse_model.py +2 -2
- udata/tests/search/test_adapter.py +12 -12
- udata/tests/search/test_results.py +4 -4
- udata/tests/site/test_site_api.py +3 -3
- udata/tests/site/test_site_metrics.py +3 -3
- udata/tests/site/test_site_rdf.py +6 -6
- udata/tests/test_activity.py +12 -0
- udata/tests/test_transfer.py +18 -17
- {udata-7.0.5.dev28172.dist-info → udata-7.0.6.dist-info}/METADATA +13 -3
- {udata-7.0.5.dev28172.dist-info → udata-7.0.6.dist-info}/RECORD +60 -54
- {udata-7.0.5.dev28172.dist-info → udata-7.0.6.dist-info}/entry_points.txt +1 -0
- {udata-7.0.5.dev28172.dist-info → udata-7.0.6.dist-info}/LICENSE +0 -0
- {udata-7.0.5.dev28172.dist-info → udata-7.0.6.dist-info}/WHEEL +0 -0
- {udata-7.0.5.dev28172.dist-info → udata-7.0.6.dist-info}/top_level.txt +0 -0
udata/__init__.py
CHANGED
udata/commands/__init__.py
CHANGED
@@ -145,9 +145,6 @@ def init_logging(app):
     handler.setFormatter(CliFormatter())
     handler.setLevel(log_level)

-    logger = logging.getLogger()
-    logger.addHandler(handler)
-
     logger = logging.getLogger('__main__')
     logger.setLevel(log_level)
     logger.handlers = []
udata/commands/dcat.py
CHANGED
@@ -8,7 +8,7 @@ from rdflib import Graph
 from udata.commands import cli, green, yellow, cyan, echo, magenta
 from udata.core.dataset.factories import DatasetFactory
 from udata.core.dataset.rdf import dataset_from_rdf
-from udata.harvest.backends.dcat import DcatBackend, CswDcatBackend
+from udata.harvest.backends.dcat import DcatBackend, CswDcatBackend, CswIso19139DcatBackend
 from udata.rdf import namespace_manager

 log = logging.getLogger(__name__)
@@ -23,9 +23,10 @@ def grp():
 @grp.command()
 @click.argument('url')
 @click.option('-q', '--quiet', is_flag=True, help='Ignore warnings')
-@click.option('-r', '--rid', help='Inspect specific remote id (contains)')
-@click.option('-c', '--csw', is_flag=True, help='The target is a CSW endpoint')
-def parse_url(url, csw, quiet=False, rid=''):
+@click.option('-r', '--rid', help='Inspect specific remote id (contains)')
+@click.option('-c', '--csw', is_flag=True, help='The target is a CSW endpoint with DCAT output')
+@click.option('-i', '--iso', is_flag=True, help='The target is a CSW endpoint with ISO output')
+def parse_url(url, csw, iso, quiet=False, rid=''):
     '''Parse the datasets in a DCAT format located at URL (debug)'''
     if quiet:
         verbose_loggers = ['rdflib', 'udata.core.dataset']
@@ -49,6 +50,8 @@ def parse_url(url, csw, quiet=False, rid=''):
     source.url = url
     if csw:
         backend = CswDcatBackend(source, dryrun=True)
+    elif iso:
+        backend = CswIso19139DcatBackend(source, dryrun=True)
     else:
         backend = DcatBackend(source, dryrun=True)
     backend.job = MockJob()
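For local debugging, the new flag can be exercised through click's standard test runner. A minimal sketch; udata commands may additionally require a configured application context, and the endpoint URL below is a placeholder:

    from click.testing import CliRunner

    from udata.commands.dcat import parse_url

    runner = CliRunner()
    # --iso selects the new CswIso19139DcatBackend, just as --csw selects
    # a CSW endpoint with DCAT output
    result = runner.invoke(parse_url, ['--iso', 'https://example.org/csw'])
    print(result.output)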
udata/core/activity/api.py
CHANGED
@@ -37,7 +37,7 @@ activity_fields = api.model('Activity', {
         description='The key of the activity', required=True),
     'icon': fields.String(
         description='The icon of the activity', required=True),
-    'kwargs': …,
+    'extras': fields.Raw(description='Extras attributes as key-value pairs'),
 })

 activity_page_fields = api.model('ActivityPage', fields.pager(activity_fields))
udata/core/activity/models.py
CHANGED
@@ -37,7 +37,7 @@ class Activity(db.Document, metaclass=EmitNewActivityMetaClass):
     related_to = db.ReferenceField(db.DomainModel, required=True)
     created_at = db.DateTimeField(default=datetime.utcnow, required=True)

-    kwargs = …
+    extras = db.ExtrasField()

     on_new = Signal()

@@ -65,8 +65,9 @@ class Activity(db.Document, metaclass=EmitNewActivityMetaClass):
         return cls.on_new.connect(func, sender=cls)

     @classmethod
-    def emit(cls, related_to, organization=None):
+    def emit(cls, related_to, organization=None, extras=None):
         new_activity.send(cls,
                           related_to=related_to,
                           actor=current_user._get_current_object(),
-                          organization=organization)
+                          organization=organization,
+                          extras=extras)
udata/core/activity/tasks.py
CHANGED
@@ -9,22 +9,23 @@ log = logging.getLogger(__name__)


 @new_activity.connect
-def delay_activity(cls, related_to, actor, organization=None):
+def delay_activity(cls, related_to, actor, organization=None, extras=None):
     emit_activity.delay(
         cls.__name__,
         str(actor.id),
         related_to_cls=related_to.__class__.__name__,
         related_to_id=str(related_to.id),
         organization_id=str(organization.id) if organization else None,
+        extras=extras
     )


 @task
 def emit_activity(classname, actor_id, related_to_cls, related_to_id,
-                  organization_id=None):
-    log.debug('Emit new activity: %s %s %s %s %s',
+                  organization_id=None, extras=None):
+    log.debug('Emit new activity: %s %s %s %s %s %s',
               classname, actor_id, related_to_cls,
-              related_to_id, organization_id)
+              related_to_id, organization_id, extras)
     cls = db.resolve_model(classname)
     actor = User.objects.get(pk=actor_id)
     related_to = db.resolve_model(related_to_cls).objects.get(pk=related_to_id)
@@ -33,4 +34,4 @@ def emit_activity(classname, actor_id, related_to_cls, related_to_id,
     else:
         organization = None
     cls.objects.create(actor=actor, related_to=related_to,
-                       organization=organization)
+                       organization=organization, extras=extras)
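Taken together, these changes let arbitrary key-value context travel with an activity: Activity.emit() passes extras through the new_activity signal, delay_activity forwards it to the Celery task as plain kwargs, and emit_activity persists it on the document. A minimal sketch, assuming saved `dataset` and `org` documents; the concrete subclass name and key below are hypothetical:

    from udata.core.activity.models import Activity

    class UserUpdatedDataset(Activity):
        # hypothetical concrete activity, for illustration only
        key = 'dataset:updated'
        icon = 'fa fa-pencil'

    # extras rides the signal, crosses the Celery boundary,
    # and ends up stored in the document's ExtrasField:
    UserUpdatedDataset.emit(related_to=dataset, organization=org,
                            extras={'changed_fields': ['title']})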
udata/core/dataset/factories.py
CHANGED
@@ -34,10 +34,8 @@ class DatasetFactory(ModelFactory):
     nb_resources = 0


-class VisibleDatasetFactory(DatasetFactory):
-    @factory.lazy_attribute
-    def resources(self):
-        return [ResourceFactory()]
+class HiddenDatasetFactory(DatasetFactory):
+    private = True


 class ChecksumFactory(ModelFactory):
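With visibility no longer tied to having resources, VisibleDatasetFactory loses its purpose and HiddenDatasetFactory covers the opposite case; a quick sketch of the intended usage:

    from udata.core.dataset.factories import DatasetFactory, HiddenDatasetFactory

    visible_dataset = DatasetFactory()       # public by default, even with zero resources
    hidden_dataset = HiddenDatasetFactory()  # private=True, so excluded from visible()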
udata/core/dataset/models.py
CHANGED
@@ -342,12 +342,10 @@ class License(db.Document):

 class DatasetQuerySet(db.OwnedQuerySet):
     def visible(self):
-        return self(private__ne=True, resources__0__exists=True,
-                    deleted=None, archived=None)
+        return self(private__ne=True, deleted=None, archived=None)

     def hidden(self):
         return self(db.Q(private=True) |
-                    db.Q(resources__0__exists=False) |
                     db.Q(deleted__ne=None) |
                     db.Q(archived__ne=None))

@@ -677,8 +675,7 @@ class Dataset(WithMetrics, BadgeMixin, db.Owned, db.Document):

     @property
     def is_hidden(self):
-        return (len(self.resources) == 0 or self.private or self.deleted
-                or self.archived)
+        return self.private or self.deleted or self.archived

     @property
     def full_title(self):
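The net effect: a dataset without any resources is no longer hidden. A minimal sketch of the new semantics (assumes a configured app and test database):

    from udata.core.dataset.factories import DatasetFactory
    from udata.core.dataset.models import Dataset

    dataset = DatasetFactory()                   # factory default: no resources
    assert not dataset.is_hidden                 # was hidden before this release
    assert dataset in Dataset.objects.visible()  # and now shows up in listings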
udata/core/dataset/rdf.py
CHANGED
@@ -7,6 +7,7 @@ import logging

 from datetime import date
 from html.parser import HTMLParser
+from typing import Optional
 from dateutil.parser import parse as parse_dt
 from flask import current_app
 from geomet import wkt
@@ -339,36 +340,51 @@ def contact_point_from_rdf(rdf, dataset):


 def spatial_from_rdf(graph):
+    geojsons = []
     for term in graph.objects(DCT.spatial):
-        …
+        try:
+            # This may not be official in the norm but some ArcGis return
+            # bbox as literal directly in DCT.spatial.
+            if isinstance(term, Literal):
+                geojson = bbox_to_geojson_multipolygon(term.toPython())
+                if geojson is not None:
+                    geojsons.append(geojson)
+
+                continue
+
+            for object in term.objects():
+                if isinstance(object, Literal):
+                    if object.datatype.__str__() == 'https://www.iana.org/assignments/media-types/application/vnd.geo+json':
+                        try:
+                            geojson = json.loads(object.toPython())
+                        except ValueError as e:
+                            log.warning(f"Invalid JSON in spatial GeoJSON {object.toPython()} {e}")
+                            continue
+                    elif object.datatype.__str__() == 'http://www.opengis.net/rdf#wktLiteral':
+                        try:
+                            # .upper() is here because geomet doesn't support Polygon but only POLYGON
+                            geojson = wkt.loads(object.toPython().strip().upper())
+                        except ValueError as e:
+                            log.warning(f"Invalid JSON in spatial WKT {object.toPython()} {e}")
+                            continue
+                    else:
+                        continue
+
+            if geojson['type'] == 'Polygon':
+                geojson['type'] = 'MultiPolygon'
+                geojson['coordinates'] = [geojson['coordinates']]
+
+            geojsons.append(geojson)
+        except Exception as e:
+            log.exception(f"Exception during `spatial_from_rdf` for term {term}: {e}", stack_info=True)

+    for geojson in geojsons:
+        spatial_coverage = SpatialCoverage(geom=geojson)
+        try:
+            spatial_coverage.clean()
+            return spatial_coverage
+        except ValidationError:
+            continue

     return None
@@ -609,3 +625,27 @@ def dataset_from_rdf(graph: Graph, dataset=None, node=None):
     dataset.harvest.modified_at = modified_at

     return dataset
+
+def bbox_to_geojson_multipolygon(bbox_as_str: str) -> Optional[dict]:
+    bbox = bbox_as_str.strip().split(',')
+    if len(bbox) != 4:
+        return None
+
+    west = float(bbox[0])
+    south = float(bbox[1])
+    east = float(bbox[2])
+    north = float(bbox[3])
+
+    low_left = [west, south]
+    top_left = [west, north]
+    top_right = [east, north]
+    low_right = [east, south]
+
+    return {
+        'type': 'MultiPolygon',
+        'coordinates': [
+            [
+                [low_left, low_right, top_right, top_left, low_left],
+            ],
+        ],
+    }
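The helper reads the literal as 'west,south,east,north' and builds a single counter-clockwise ring; a quick worked example derived from the code above:

    from udata.core.dataset.rdf import bbox_to_geojson_multipolygon

    geom = bbox_to_geojson_multipolygon('2.22,48.81,2.47,48.90')
    assert geom == {
        'type': 'MultiPolygon',
        'coordinates': [[[
            [2.22, 48.81], [2.47, 48.81], [2.47, 48.90], [2.22, 48.90], [2.22, 48.81],
        ]]],
    }
    # anything that does not split into exactly four parts yields None
    assert bbox_to_geojson_multipolygon('1,2,3') is None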
udata/core/dataset/search.py
CHANGED
udata/core/spatial/tests/test_api.py
CHANGED
@@ -1,15 +1,12 @@
 from flask import url_for

-from udata.utils import get_by
-
 from udata.utils import faker
 from udata.tests.api import APITestCase
 from udata.tests.features.territories import (
     create_geozones_fixtures, TerritoriesSettings
 )
-from udata.tests.helpers import assert_json_equal
 from udata.core.organization.factories import OrganizationFactory
-from udata.core.dataset.factories import VisibleDatasetFactory
+from udata.core.dataset.factories import DatasetFactory
 from udata.core.spatial.factories import (
     SpatialCoverageFactory, GeoZoneFactory, GeoLevelFactory
 )
@@ -173,7 +170,7 @@ class SpatialApiTest(APITestCase):
         paca, bdr, arles = create_geozones_fixtures()
         organization = OrganizationFactory()
         for _ in range(3):
-            VisibleDatasetFactory(
+            DatasetFactory(
                 organization=organization,
                 spatial=SpatialCoverageFactory(zones=[paca.id]))
@@ -185,7 +182,7 @@ class SpatialApiTest(APITestCase):
         paca, bdr, arles = create_geozones_fixtures()
         organization = OrganizationFactory()
         for _ in range(3):
-            VisibleDatasetFactory(
+            DatasetFactory(
                 organization=organization,
                 spatial=SpatialCoverageFactory(zones=[paca.id]))
@@ -198,7 +195,7 @@ class SpatialApiTest(APITestCase):
         paca, bdr, arles = create_geozones_fixtures()
         organization = OrganizationFactory()
         for _ in range(3):
-            VisibleDatasetFactory(
+            DatasetFactory(
                 organization=organization,
                 spatial=SpatialCoverageFactory(zones=[paca.id]))
@@ -212,7 +209,7 @@ class SpatialApiTest(APITestCase):
         paca, bdr, arles = create_geozones_fixtures()
         organization = OrganizationFactory()
         for _ in range(3):
-            VisibleDatasetFactory(
+            DatasetFactory(
                 organization=organization,
                 spatial=SpatialCoverageFactory(zones=[paca.id]))
@@ -241,7 +238,7 @@ class SpatialTerritoriesApiTest(APITestCase):
         paca, bdr, arles = create_geozones_fixtures()
         organization = OrganizationFactory()
         for _ in range(3):
-            VisibleDatasetFactory(
+            DatasetFactory(
                 organization=organization,
                 spatial=SpatialCoverageFactory(zones=[paca.id]))
@@ -255,7 +252,7 @@ class SpatialTerritoriesApiTest(APITestCase):
         paca, bdr, arles = create_geozones_fixtures()
         organization = OrganizationFactory()
         for _ in range(3):
-            VisibleDatasetFactory(
+            DatasetFactory(
                 organization=organization,
                 spatial=SpatialCoverageFactory(zones=[paca.id]))
udata/core/topic/factories.py
CHANGED
@@ -1,7 +1,7 @@
 import factory

 from udata import utils
-from udata.core.dataset.factories import VisibleDatasetFactory
+from udata.core.dataset.factories import DatasetFactory
 from udata.core.reuse.factories import VisibleReuseFactory
 from udata.factories import ModelFactory

@@ -19,7 +19,7 @@ class TopicFactory(ModelFactory):

     @factory.lazy_attribute
     def datasets(self):
-        return VisibleDatasetFactory.create_batch(3)
+        return DatasetFactory.create_batch(3)

     @factory.lazy_attribute
     def reuses(self):
udata/harvest/backends/dcat.py
CHANGED
@@ -2,7 +2,7 @@ import logging

 from rdflib import Graph, URIRef
 from rdflib.namespace import RDF
-import xml.etree.ElementTree as ET
+import lxml.etree as ET
 import boto3
 from flask import current_app
 from datetime import date
@@ -173,7 +173,36 @@ class DcatBackend(BaseBackend):
         dataset = self.get_dataset(item.remote_id)
         dataset = dataset_from_rdf(graph, dataset, node=node)
         return dataset
+

+    def next_record_if_should_continue(self, start, search_results):
+        next_record = int(search_results.attrib['nextRecord'])
+        matched_count = int(search_results.attrib['numberOfRecordsMatched'])
+        returned_count = int(search_results.attrib['numberOfRecordsReturned'])
+
+        # Break conditions copied gratefully from
+        # noqa https://github.com/geonetwork/core-geonetwork/blob/main/harvesters/src/main/java/org/fao/geonet/kernel/harvest/harvester/csw/Harvester.java#L338-L369
+        break_conditions = (
+            # standard CSW: A value of 0 means all records have been returned.
+            next_record == 0,
+
+            # Misbehaving CSW server returning a next record > matched count
+            next_record > matched_count,
+
+            # No results returned already
+            returned_count == 0,
+
+            # Current next record is lower than previous one
+            next_record < start,
+
+            # Enough items have been harvested already
+            self.max_items and len(self.job.items) >= self.max_items
+        )
+
+        if any(break_conditions):
+            return None
+        else:
+            return next_record

 class CswDcatBackend(DcatBackend):
     display_name = 'CSW-DCAT'
@@ -201,17 +230,18 @@ class CswDcatBackend(DcatBackend):
         graphs = []
         page = 0
         start = 1
+
         response = self.post(url, data=body.format(start=start, schema=self.DCAT_SCHEMA),
                              headers=headers)
         response.raise_for_status()
-        content = response.text
+        content = response.content
         tree = ET.fromstring(content)
         if tree.tag == '{' + OWS_NAMESPACE + '}ExceptionReport':
             raise ValueError(f'Failed to query CSW:\n{content}')
         while tree:
             graph = Graph(namespace_manager=namespace_manager)
             search_results = tree.find('csw:SearchResults', {'csw': CSW_NAMESPACE})
-            if not search_results:
+            if search_results is None:
                 log.error(f'No search results found for {url} on page {page}')
                 break
             for child in search_results:
@@ -225,37 +255,111 @@ class CswDcatBackend(DcatBackend):
             kwargs['type'] = 'uriref' if isinstance(node, URIRef) else 'blank'
             self.add_item(id, **kwargs)
             graphs.append(graph)
+
+            next_record = self.next_record_if_should_continue(start, search_results)
+            if not next_record:
+                break
+
+            start = next_record
             page += 1

-            next_record = int(search_results.attrib['nextRecord'])
-            matched_count = int(search_results.attrib['numberOfRecordsMatched'])
-            returned_count = int(search_results.attrib['numberOfRecordsReturned'])
-
-            # Break conditions copied gratefully from
-            # noqa https://github.com/geonetwork/core-geonetwork/blob/main/harvesters/src/main/java/org/fao/geonet/kernel/harvest/harvester/csw/Harvester.java#L338-L369
-            break_conditions = (
-                # standard CSW: A value of 0 means all records have been returned.
-                next_record == 0,
-
-                # Misbehaving CSW server returning a next record > matched count
-                next_record > matched_count,
-
-                # No results returned already
-                returned_count == 0,
-
-                # Current next record is lower than previous one
-                next_record < start,
-
-                # Enough items have been harvested already
-                self.max_items and len(self.job.items) >= self.max_items
-            )
-
-            if any(break_conditions):
-                break
-
-            start = next_record
-            tree = ET.fromstring(
-                self.post(url, data=body.format(start=start, schema=self.DCAT_SCHEMA),
-                          headers=headers).text)
+            tree = ET.fromstring(
+                self.post(url, data=body.format(start=start, schema=self.DCAT_SCHEMA),
+                          headers=headers).content)
+
+        return graphs
+
+
+class CswIso19139DcatBackend(DcatBackend):
+    '''
+    A harvester that takes CSW ISO 19139 as input and transforms it to DCAT
+    using the SEMIC GeoDCAT-AP XSLT.
+    The parsing of items is then the same as for the DcatBackend.
+    '''
+
+    display_name = 'CSW-ISO-19139'
+
+    ISO_SCHEMA = 'http://www.isotc211.org/2005/gmd'
+
+    XSL_URL = "https://raw.githubusercontent.com/SEMICeu/iso-19139-to-dcat-ap/master/iso-19139-to-dcat-ap.xsl"
+
+    def parse_graph(self, url: str, fmt: str) -> List[Graph]:
+        '''
+        Parse CSW graph querying ISO schema.
+        Use SEMIC GeoDCAT-AP XSLT to map it to a correct version.
+        See https://github.com/SEMICeu/iso-19139-to-dcat-ap for more information on the XSLT.
+        '''
+        # Load XSLT
+        xsl = ET.fromstring(self.get(self.XSL_URL).content)
+        transform = ET.XSLT(xsl)
+
+        # Start querying and parsing graph
+        body = '''<csw:GetRecords xmlns:csw="http://www.opengis.net/cat/csw/2.0.2"
+                                  xmlns:gmd="http://www.isotc211.org/2005/gmd"
+                                  service="CSW" version="2.0.2" resultType="results"
+                                  startPosition="{start}" maxPosition="10"
+                                  outputSchema="{schema}">
+                      <csw:Query typeNames="csw:Record">
+                          <csw:ElementSetName>full</csw:ElementSetName>
+                          <csw:Constraint version="1.1.0">
+                              <ogc:Filter xmlns:ogc="http://www.opengis.net/ogc">
+                                  <ogc:PropertyIsEqualTo>
+                                      <ogc:PropertyName>dc:type</ogc:PropertyName>
+                                      <ogc:Literal>dataset</ogc:Literal>
+                                  </ogc:PropertyIsEqualTo>
+                              </ogc:Filter>
+                          </csw:Constraint>
+                      </csw:Query>
+                  </csw:GetRecords>'''
+        headers = {'Content-Type': 'application/xml'}
+
+        graphs = []
+        page = 0
+        start = 1
+
+        response = self.post(url, data=body.format(start=start, schema=self.ISO_SCHEMA),
+                             headers=headers)
+        response.raise_for_status()
+
+        tree_before_transform = ET.fromstring(response.content)
+        # Disabling CoupledResourceLookUp to prevent failure on xlink:href
+        # https://github.com/SEMICeu/iso-19139-to-dcat-ap/blob/master/documentation/HowTo.md#parameter-coupledresourcelookup
+        tree = transform(tree_before_transform, CoupledResourceLookUp="'disabled'")
+
+        while tree:
+            # We query the tree before the transformation because the XSLT removes
+            # the search results infos (useful for pagination)
+            search_results = tree_before_transform.find('csw:SearchResults', {'csw': CSW_NAMESPACE})
+            if search_results is None:
+                log.error(f'No search results found for {url} on page {page}')
                 break

+            subgraph = Graph(namespace_manager=namespace_manager)
+            subgraph.parse(ET.tostring(tree), format=fmt)
+
+            if not subgraph.subjects(RDF.type, DCAT.Dataset):
+                raise ValueError("Failed to fetch CSW content")
+
+            for node in subgraph.subjects(RDF.type, DCAT.Dataset):
+                id = subgraph.value(node, DCT.identifier)
+                kwargs = {'nid': str(node), 'page': page}
+                kwargs['type'] = 'uriref' if isinstance(node, URIRef) else 'blank'
+                self.add_item(id, **kwargs)
+            graphs.append(subgraph)
+
+            next_record = self.next_record_if_should_continue(start, search_results)
+            if not next_record:
+                break
+
+            start = next_record
+            page += 1
+
+            response = self.post(url, data=body.format(start=start, schema=self.ISO_SCHEMA),
+                                 headers=headers)
+            response.raise_for_status()
+
+            tree_before_transform = ET.fromstring(response.content)
+            tree = transform(tree_before_transform, CoupledResourceLookUp="'disabled'")

         return graphs
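For reference, the pagination helper consumes its attributes straight off csw:SearchResults; a standalone sketch with a stubbed response, no live harvest job involved:

    import lxml.etree as ET

    stub = ET.fromstring(
        '<csw:SearchResults xmlns:csw="http://www.opengis.net/cat/csw/2.0.2" '
        'nextRecord="11" numberOfRecordsMatched="25" numberOfRecordsReturned="10"/>'
    )
    next_record = int(stub.attrib['nextRecord'])            # 11: another page follows
    matched = int(stub.attrib['numberOfRecordsMatched'])    # 25
    returned = int(stub.attrib['numberOfRecordsReturned'])  # 10
    # next_record_if_should_continue returns None (stop) when nextRecord is 0,
    # exceeds the matched count, nothing was returned, nextRecord went backwards,
    # or the max_items budget is spent; otherwise it returns nextRecord as the
    # next start position.
    assert 0 < next_record <= matched and returned > 0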