PyPI - OneStop4All-Indexer - Versions diffs - 2.8.0.dev12__tar.gz → 2.8.0.dev14__tar.gz - Mend

OneStop4All-Indexer 2.8.0.dev12tar.gz → 2.8.0.dev14tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (37) hide show

{onestop4all_indexer-2.8.0.dev12 → onestop4all_indexer-2.8.0.dev14/OneStop4All_Indexer.egg-info}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: OneStop4All-Indexer
-Version: 2.8.0.dev12
+Version: 2.8.0.dev14
 Summary: Library to harvest data from NFDI4Earth-KnowledgeHub to OneStop4All-Index
 Author: Markus Konkol, Arne Vogt, Tom Niers, Ralf Klammer
 Author-email: m.konkol@52north.org, a.vogt@52north.org, tom.niers@tu-dresden.de, ralf.klammer@tu-dresden.de

{onestop4all_indexer-2.8.0.dev12 → onestop4all_indexer-2.8.0.dev14}/OneStop4All_Indexer.egg-info/SOURCES.txt RENAMED Viewed

@@ -24,7 +24,7 @@ harvesters/harvester_metadatastandards.py
 harvesters/harvester_organization.py
 harvesters/harvester_repository.py
 harvesters/harvester_service.py
-harvesters/harvester_softwaresourcecode.py
+harvesters/harvester_software.py
 utils/__init__.py
 utils/cli.py
 utils/configs.py

{onestop4all_indexer-2.8.0.dev12/OneStop4All_Indexer.egg-info → onestop4all_indexer-2.8.0.dev14}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: OneStop4All-Indexer
-Version: 2.8.0.dev12
+Version: 2.8.0.dev14
 Summary: Library to harvest data from NFDI4Earth-KnowledgeHub to OneStop4All-Index
 Author: Markus Konkol, Arne Vogt, Tom Niers, Ralf Klammer
 Author-email: m.konkol@52north.org, a.vogt@52north.org, tom.niers@tu-dresden.de, ralf.klammer@tu-dresden.de

{onestop4all_indexer-2.8.0.dev12 → onestop4all_indexer-2.8.0.dev14}/harvesters/__init__.py RENAMED Viewed

@@ -1,7 +1,7 @@
 from .harvester_repository import *
 from .harvester_organization import *
 from .harvester_article import *
-from .harvester_softwaresourcecode import *
+from .harvester_software import *
 from .harvester_learningresource import *
 from .harvester_metadatastandards import *
 from .harvester_document import *

{onestop4all_indexer-2.8.0.dev12 → onestop4all_indexer-2.8.0.dev14}/harvesters/harvester_organization.py RENAMED Viewed

@@ -88,11 +88,34 @@ class Organization_Harvester(Harvester):
         )
         organization_list = []
-        for (
-            key,
-            value,
-        ) in organizations.items():  # transform orga dict to list for indexing
+        for key, value in organizations.items():  # transform orga dict to list for indexing
             organization = value
+            # clean geometry: if more than one geometry is present,
+            # keep only the first one (should not happen, but just in case)
+            geom = organization.get("geometry", [])
+            if len(geom) > 1:
+                organization["geometry"] = geom[:1]
+            # clean rorId: if more than one rorId is present,
+            # keep only unique ones (should not happen, but just in case)
+            rorid = organization.get("rorId", [])
+            if len(rorid) > 1:
+                organization["rorId"] = list(set(rorid))
+            # Transform locations
+            localities = organization.get("locality", [])
+            countries = organization.get("countryName", [])
+            complete_locations = []
+            for loc, country in zip(localities, countries):
+                complete_locations.append(f"{loc}, {country}")
+            # Filter duplication
+            organization["location"] = list(dict.fromkeys(complete_locations))
+            organization["locality"] = list(dict.fromkeys(localities))
+            organization["countryName"] = list(dict.fromkeys(countries))
             # ensure mainTitle
             if (
                 "mainTitle" not in organization
@@ -137,9 +160,23 @@ class Organization_Harvester(Harvester):
                     assignto_dict[subject[0]][attribute] = []
                 assignto_dict[subject[0]][attribute].extend(organization_name)
+    def does_object_exist(self, value, attribute: str, data: dict):
+        return attribute in data and value in data[attribute]
     def parse_response(
         self, hits, organizations, issuborganization, hasN4Econtact
     ):
+        PREDICATES = {
+            "http://xmlns.com/foaf/0.1/homepage": "homepage",
+            "http://www.w3.org/2002/07/owl#sameAs": "sameAs",
+            "http://w3id.org/nfdi4ing/metadata4ing#hasRorId": "rorId",
+            "http://www.w3.org/2004/02/skos/core#altLabel": "altLabel",
+            "http://www.w3.org/1999/02/22-rdf-syntax-ns#type": "type",
+            "http://nfdi4earth.de/ontology/sourceSystemURL": "sourceSystemURL",
+            "http://nfdi4earth.de/ontology/hasSignedCommitment": "hasSignedCommitment",
+            "http://nfdi4earth.de/ontology/sourceSystemID": "sourceSystem" + self.flatten_separator + "id"
+        }
         for hit in hits:
             subject = hit["subject"]["value"]
             predicate = hit["predicate"]["value"]
@@ -151,6 +188,8 @@ class Organization_Harvester(Harvester):
                 organizations[subject]["id"] = self.getID(
                     subject
                 )  # use ID from triple store also in Solr to ensure stable IDs
+                organizations[subject]["locality"] = []
+                organizations[subject]["countryName"] = []
             # set geometry if available and not already set
             if (
@@ -179,8 +218,9 @@ class Organization_Harvester(Harvester):
             if predicate == "http://schema.org/name":  # name
                 if (
-                    "xml:lang" not in hit["object"]
-                    or hit["object"]["xml:lang"] == "en"
+                    ("xml:lang" not in hit["object"]
+                    or hit["object"]["xml:lang"] == "en")
+                    and not self.does_object_exist(object, "name", organizations[subject])
                 ):  # use international name for orga name
                     self.addValue(
                         dict=organizations[subject],
@@ -189,60 +229,30 @@ class Organization_Harvester(Harvester):
                     )
                     organizations[subject]["mainTitle"] = object  # mainTitle
                 if (
-                    "name_alt" not in organizations[subject]
-                    or object not in organizations[subject]["name_alt"]
+                    not self.does_object_exist(object, "name_alt", organizations[subject])
                 ):  # prevent duplicates
                     self.addValue(
                         dict=organizations[subject],
                         attribute="name_alt",
                         value=object,
                     )
-            elif predicate == "http://xmlns.com/foaf/0.1/homepage":  # homepage
-                self.addValue(
-                    dict=organizations[subject],
-                    attribute="homepage",
-                    value=object,
-                )
-            elif (
-                predicate == "http://www.w3.org/2006/vcard/ns#locality"
-            ):  # locality
-                self.addValue(
-                    dict=organizations[subject],
-                    attribute="locality",
-                    value=object,
-                )
-            elif (
-                predicate == "http://www.w3.org/2006/vcard/ns#country-name"
-            ):  # countryName
-                self.addValue(
-                    dict=organizations[subject],
-                    attribute="countryName",
-                    value=object,
-                )
-            elif predicate == "http://www.w3.org/2002/07/owl#sameAs":  # sameAs
-                self.addValue(
-                    dict=organizations[subject],
-                    attribute="sameAs",
-                    value=object,
-                )
-            elif (
-                predicate == "http://w3id.org/nfdi4ing/metadata4ing#hasRorId"
-            ):  # rorId
-                self.addValue(
-                    dict=organizations[subject],
-                    attribute="rorId",
-                    value=object,
-                )
-            elif (
-                predicate == "http://www.w3.org/2004/02/skos/core#altLabel"
-            ):  # altLabel
-                self.addValue(
-                    dict=organizations[subject],
-                    attribute="altLabel",
-                    value=object,
-                )
+            elif predicate == "http://www.w3.org/2006/vcard/ns#locality":
+                organizations[subject]["locality"].append(object)
+            elif predicate == "http://www.w3.org/2006/vcard/ns#country-name":
+                organizations[subject]["countryName"].append(object)
+            elif predicate in PREDICATES:
+                attribute = PREDICATES[predicate]
+                if not self.does_object_exist(object, attribute, organizations[subject]):
+                    self.addValue(
+                        dict=organizations[subject],
+                        attribute=attribute,
+                        value=object)
             elif (
                 predicate == "http://www.w3.org/ns/org#subOrganizationOf"
+                and (subject, object) not in issuborganization
             ):  # subOrganizationOf
                 issuborganization.append(
                     (subject, object)
@@ -250,60 +260,8 @@ class Organization_Harvester(Harvester):
             elif (
                 predicate
                 == "http://nfdi4earth.de/ontology/hasNFDI4EarthContactPerson"
+                and (subject, object) not in hasN4Econtact
             ):  # NFDI4EarthContactPerson
                 hasN4Econtact.append(
                     (subject, object)
                 )  # store, resolve contact info later
-            elif (
-                predicate == "http://www.w3.org/1999/02/22-rdf-syntax-ns#type"
-            ):  # type
-                self.addValue(
-                    dict=organizations[subject], attribute="type", value=object
-                )
-            elif (
-                predicate == "http://nfdi4earth.de/ontology/sourceSystem"
-            ):  # sourceSystem
-                if "sourceSystem_homepage" in hit:
-                    self.addValue(
-                        organizations[subject],
-                        "sourceSystem" + self.flatten_separator + "homepage",
-                        hit["sourceSystem_homepage"]["value"],
-                    )
-                if "sourceSystem_title" in hit:
-                    self.addValue(
-                        organizations[subject],
-                        "sourceSystem" + self.flatten_separator + "title",
-                        hit["sourceSystem_title"]["value"],
-                    )
-            elif (
-                predicate == "http://nfdi4earth.de/ontology/sourceSystemID"
-            ):  # sourceSystemID
-                if (
-                    "sourceSystem" + self.flatten_separator + "id"
-                    not in organizations[subject]
-                ):
-                    # only set if not already present
-                    self.addValue(
-                        dict=organizations[subject],
-                        attribute="sourceSystem"
-                        + self.flatten_separator
-                        + "id",
-                        value=object,
-                    )
-            elif (
-                predicate == "http://nfdi4earth.de/ontology/sourceSystemURL"
-            ):  # sourceSystemURL
-                self.addValue(
-                    dict=organizations[subject],
-                    attribute="sourceSystemURL",
-                    value=object,
-                )
-            elif (
-                predicate
-                == "http://nfdi4earth.de/ontology/hasSignedCommitment"
-            ):  # hasSignedCommitment
-                self.addValue(
-                    dict=organizations[subject],
-                    attribute="hasSignedCommitment",
-                    value=object,
-                )

onestop4all_indexer-2.8.0.dev14/harvesters/harvester_service.py ADDED Viewed

@@ -0,0 +1,551 @@
+import logging
+from .harvester_base import Harvester
+from utils import sparql
+from data_repositories.repository_n4eorganization import (
+    RepositoryN4EOrganization,
+)
+log = logging.getLogger(__name__)
+# harvester for Services https://nfdi4earth.pages.rwth-aachen.de/knowledgehub/nfdi4earth-kh-schema/Service/
+class Service_Harvester(Harvester):
+    sparql_query = """
+        PREFIX fo: <http://www.w3.org/1999/XSL/Format#>
+        PREFIX foaf: <http://xmlns.com/foaf/0.1/>
+        PREFIX dc: <http://purl.org/dc/elements/1.1/>
+        PREFIX vcard: <http://www.w3.org/2006/vcard/ns#>
+        prefix dcat: <http://www.w3.org/ns/dcat#>
+        prefix dct: <http://purl.org/dc/terms/>
+        prefix n4e: <http://nfdi4earth.de/ontology/>
+        PREFIX m4i: <http://w3id.org/nfdi4ing/metadata4ing#>
+        PREFIX geo: <http://www.opengis.net/ont/geosparql#>
+        prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
+        PREFIX schema: <http://schema.org/>
+        SELECT ?subject ?predicate ?object
+            ?contactpoint_email
+            ?contactpoint_url
+            ?contact_fn
+            ?contact_email
+            ?serviceProvider_homepage ?serviceProvider_imprint ?serviceProvider_rorId ?serviceProvider_name
+            ?serviceLocationPoint
+            ?tangibleKPI_kpiType ?tangibleKPI_kpiValue ?tangibleKPI_kpiNotes
+        WHERE {
+            {
+                # Page over distinct subjects first — avoids ORDER BY + OFFSET on the full
+                # UNION result set which forces full materialization and causes timeouts.
+                SELECT DISTINCT ?subject
+                WHERE {
+                    ?subject rdf:type n4e:Service .
+                }
+                OFFSET %d
+                LIMIT %d
+            }
+            {
+                ?subject ?predicate ?object .
+                FILTER (?predicate NOT IN (dcat:contactPoint, n4e:sourceSystem))
+            }
+            UNION {
+                VALUES ?predicate { dcat:contactPoint }
+                ?subject dcat:contactPoint ?object .
+                OPTIONAL { ?object vcard:hasEmail ?contactpoint_email . }
+                OPTIONAL { ?object vcard:hasURL ?contactpoint_url . }
+            }
+            UNION {
+                VALUES ?predicate { n4e:firstLevelSupportContact }
+                ?subject n4e:firstLevelSupportContact ?object .
+                OPTIONAL { ?object vcard:fn ?contact_fn . }
+                OPTIONAL { ?object vcard:hasEmail ?contact_email . }
+            }
+            UNION {
+                VALUES ?predicate { n4e:securityIncidentContact }
+                ?subject n4e:securityIncidentContact ?object .
+                OPTIONAL { ?object vcard:fn ?contact_fn . }
+                OPTIONAL { ?object vcard:hasEmail ?contact_email . }
+            }
+            UNION {
+                VALUES ?predicate { n4e:serviceOwner }
+                ?subject n4e:serviceOwner ?object .
+                OPTIONAL { ?object vcard:fn ?contact_fn . }
+                OPTIONAL { ?object vcard:hasEmail ?contact_email . }
+            }
+            UNION {
+                VALUES ?predicate { n4e:serviceManager }
+                ?subject n4e:serviceManager ?object .
+                OPTIONAL { ?object vcard:fn ?contact_fn . }
+                OPTIONAL { ?object vcard:hasEmail ?contact_email . }
+            }
+            UNION {
+                VALUES ?predicate { n4e:serviceProvider }
+                ?subject n4e:serviceProvider ?object .
+                OPTIONAL { ?object foaf:homepage ?serviceProvider_homepage . }
+                OPTIONAL { ?object n4e:hasImprint ?serviceProvider_imprint . }
+                OPTIONAL { ?object m4i:hasRorId ?serviceProvider_rorId . }
+                OPTIONAL { ?object schema:name ?serviceProvider_name . }
+            }
+            UNION {
+                VALUES ?predicate { n4e:serviceLocation }
+                ?subject n4e:serviceLocation ?object .
+                OPTIONAL { ?object geo:asWKT ?serviceLocationPoint . }
+            }
+            UNION {
+                VALUES ?predicate { n4e:tangibleKPI }
+                ?subject n4e:tangibleKPI ?object .
+                OPTIONAL { ?object n4e:kpiType ?tangibleKPI_kpiType . }
+                OPTIONAL { ?object n4e:kpiValue ?tangibleKPI_kpiValue . }
+                OPTIONAL { ?object n4e:kpiNotes ?tangibleKPI_kpiNotes . }
+            }
+        }
+    """
+    def __init__(
+        self, n4e_organizations_repo: RepositoryN4EOrganization, **kw
+    ):
+        super().__init__(**kw)
+        self.n4e_organizations_repo = n4e_organizations_repo
+    def harvest(self):
+        limit = 5000
+        # convert to list of repo documents for indexing
+        services = {}  # repos dict
+        i = 0
+        # split sparql query by paging over distinct subjects
+        # (sub-SELECT OFFSET/LIMIT)
+        while True:
+            query_splitted = self.sparql_query % (limit * i, limit)
+            hits = sparql.execute_query(self.sparql_endpoint, query_splitted)
+            subjects_before = len(services)
+            self.parse_response(hits, services)
+            new_subjects = len(services) - subjects_before
+            i += 1
+            # Stop when the sub-SELECT returned fewer subjects than the page size
+            if new_subjects < limit:
+                break
+        services_list = []
+        for (
+            key,
+            value,
+        ) in services.items():  # transform repos dict to list for indexing
+            service = value
+            if "mainTitle" not in service and len(service["name"]) > 0:
+                service["mainTitle"] = service["name"][0]
+            service["mainTitle"] = service["mainTitle"].strip()
+            services_list.append(service)
+        return services_list
+    def parse_response(
+            self, hits, services
+        ):
+            for hit in hits:
+                subject = hit["subject"]["value"]
+                predicate = hit["predicate"]["value"]
+                object = hit["object"]["value"]
+                if subject not in services:
+                    services[subject] = {}
+                    services[subject]["uri"] = subject
+                    services[subject]["id"] = self.getID(
+                        subject
+                    )  # use ID from triple store also in Solr to ensure stable IDs
+                if predicate == "http://schema.org/name":  # name
+                    if (
+                        "xml:lang" not in hit["object"]
+                        or hit["object"]["xml:lang"] == "en"
+                    ):  # use international name for orga name
+                        self.addValue(
+                            dict=services[subject],
+                            attribute="name",
+                            value=object,
+                        )
+                        services[subject]["mainTitle"] = object  # mainTitle
+                    if (
+                        "name" not in services[subject]
+                        or object not in services[subject]["name"]
+                    ):  # prevent duplicates
+                        self.addValue(
+                            dict=services[subject],
+                            attribute="name",
+                            value=object,
+                        )
+                elif predicate == "http://schema.org/description":  # description
+                    self.addValue(
+                        dict=services[subject],
+                        attribute="description",
+                        value=object,
+                    )
+                elif predicate == "http://schema.org/additionalType":  # additionalType
+                    self.addValue(
+                        dict=services[subject],
+                        attribute="additionalType",
+                        value=object,
+                    )
+                elif predicate == "http://schema.org/keywords":  # keyword
+                    self.addValue(
+                        dict=services[subject],
+                        attribute="keyword",
+                        value=object,
+                    )
+                elif predicate == "http://schema.org/url":  # url
+                    self.addValue(
+                        dict=services[subject],
+                        attribute="url",
+                        value=object,
+                    )
+                elif predicate == "http://nfdi4earth.de/ontology/serviceType":  # serviceType
+                    services[subject]["serviceType"] = object
+                elif predicate == "http://nfdi4earth.de/ontology/serviceHost":  # serviceHost
+                    services[subject]["serviceHost"] = object
+                elif predicate == "http://www.w3.org/ns/dcat#contactPoint":  # contactPoint
+                    if "contactpoint_email" in hit:
+                        self.addValue(
+                            services[subject],
+                            "contactPoint" + self.flatten_separator + "email",
+                            hit["contactpoint_email"]["value"],
+                        )
+                    if "contactpoint_url" in hit:
+                        self.addValue(
+                            services[subject],
+                            "contactPoint" + self.flatten_separator + "url",
+                            hit["contactpoint_url"]["value"],
+                        )
+                elif predicate == "http://nfdi4earth.de/ontology/firstLevelSupportContact":
+                    if "contact_fn" in hit:
+                        self.addValue(
+                            services[subject],
+                            "firstLevelSupportContact" + self.flatten_separator + "fullname",
+                            hit["contact_fn"]["value"],
+                        )
+                    if "contact_email" in hit:
+                        self.addValue(
+                            services[subject],
+                            "firstLevelSupportContact" + self.flatten_separator + "hasEmail",
+                            hit["contact_email"]["value"],
+                        )
+                elif predicate == "http://nfdi4earth.de/ontology/serviceOwner":
+                    if "contact_fn" in hit:
+                        self.addValue(
+                            services[subject],
+                            "serviceOwner" + self.flatten_separator + "fullname",
+                            hit["contact_fn"]["value"],
+                        )
+                    if "contact_email" in hit:
+                        self.addValue(
+                            services[subject],
+                            "serviceOwner" + self.flatten_separator + "hasEmail",
+                            hit["contact_email"]["value"],
+                        )
+                elif predicate == "http://nfdi4earth.de/ontology/serviceManager":
+                    if "contact_fn" in hit:
+                        self.addValue(
+                            services[subject],
+                            "serviceManager" + self.flatten_separator + "fullname",
+                            hit["contact_fn"]["value"],
+                        )
+                    if "contact_email" in hit:
+                        self.addValue(
+                            services[subject],
+                            "serviceManager" + self.flatten_separator + "hasEmail",
+                            hit["contact_email"]["value"],
+                        )
+                elif predicate == "http://nfdi4earth.de/ontology/serviceProvider":
+                    if "serviceProvider_homepage" in hit:
+                        self.addValue(
+                            services[subject],
+                            "serviceProvider" + self.flatten_separator + "homepage",
+                            hit["serviceProvider_homepage"]["value"],
+                        )
+                    if "serviceProvider_imprint" in hit:
+                        self.addValue(
+                            services[subject],
+                            "serviceProvider" + self.flatten_separator + "imprint",
+                            hit["serviceProvider_imprint"]["value"],
+                        )
+                    if "serviceProvider_rorId" in hit:
+                        self.addValue(
+                            services[subject],
+                            "serviceProvider" + self.flatten_separator + "rorId",
+                            hit["serviceProvider_rorId"]["value"],
+                        )
+                    if "serviceProvider_name" in hit:
+                        self.addValue(
+                            services[subject],
+                            "serviceProvider" + self.flatten_separator + "name",
+                            hit["serviceProvider_name"]["value"],
+                        )
+                elif predicate == "http://nfdi4earth.de/ontology/serviceLocation":
+                    if "serviceLocationPoint" in hit:
+                        self.addValue(
+                            services[subject],
+                            "serviceLocationPoint",
+                            hit["serviceLocationPoint"]["value"],
+                        )
+                elif predicate == "http://nfdi4earth.de/ontology/securityIncidentContact":
+                    if "contact_fn" in hit:
+                        self.addValue(
+                            services[subject],
+                            "securityIncidentContact" + self.flatten_separator + "fullname",
+                            hit["contact_fn"]["value"],
+                        )
+                    if "contact_email" in hit:
+                        self.addValue(
+                            services[subject],
+                            "securityIncidentContact" + self.flatten_separator + "hasEmail",
+                            hit["contact_email"]["value"],
+                        )
+                elif (
+                    predicate == "http://nfdi4earth.de/ontology/sourceSystemID"
+                ):  # sourceSystemID
+                    self.addValue(
+                        dict=services[subject],
+                        attribute="sourceSystem" + self.flatten_separator + "id",
+                        value=object,
+                    )
+                elif (
+                    predicate == "http://nfdi4earth.de/ontology/sourceSystem"
+                ):  # sourceSystem
+                    if "sourceSystem_homepage" in hit:
+                        self.addValue(
+                            services[subject],
+                            "sourceSystem" + self.flatten_separator + "homepage",
+                            hit["sourceSystem_homepage"]["value"],
+                        )
+                    if "sourceSystem_title" in hit:
+                        self.addValue(
+                            services[subject],
+                            "sourceSystem" + self.flatten_separator + "title",
+                            hit["sourceSystem_title"]["value"],
+                        )
+                elif (
+                    predicate == "http://nfdi4earth.de/ontology/sourceSystemURL"
+                ):  # sourceSystemURL
+                    self.addValue(
+                        dict=services[subject],
+                        attribute="sourceSystemURL",
+                        value=object,
+                    )
+                elif (
+                    predicate == "http://www.w3.org/2004/02/skos/core#altLabel"
+                ):  # altLabel
+                    self.addValue(
+                        dict=services[subject],
+                        attribute="altLabel",
+                        value=object,
+                    )
+                elif (
+                    predicate == "http://www.w3.org/1999/02/22-rdf-syntax-ns#type"
+                ):  # type
+                    self.addValue(
+                        dict=services[subject],
+                        attribute="type",
+                        value=object
+                    )
+                elif (
+                    predicate == "http://nfdi4earth.de/ontology/serviceType"
+                ):  # serviceType
+                    self.addValue(
+                        dict=services[subject],
+                        attribute="serviceType",
+                        value=object
+                    )
+                elif (
+                    predicate == "http://nfdi4earth.de/ontology/serviceCategory"
+                ):  # serviceCategory
+                    self.addValue(
+                        dict=services[subject],
+                        attribute="serviceCategory",
+                        value=object
+                    )
+                elif (
+                    predicate == "http://nfdi4earth.de/ontology/linkToDocumentation"
+                ):  # linkToDocumentation
+                    self.addValue(
+                        dict=services[subject],
+                        attribute="linkToDocumentation",
+                        value=object
+                    )
+                elif (
+                    predicate == "http://nfdi4earth.de/ontology/nameAbbreviation"
+                ):  # nameAbbreviation
+                    self.addValue(
+                        dict=services[subject],
+                        attribute="nameAbbreviation",
+                        value=object
+                    )
+                elif (
+                    predicate == "http://nfdi4earth.de/ontology/shortDescription"
+                ):  # shortDescription
+                    self.addValue(
+                        dict=services[subject],
+                        attribute="shortDescription",
+                        value=object
+                    )
+                elif (
+                    predicate == "http://nfdi4earth.de/ontology/chargeFree"
+                ):  # chargeFree //BOOLEAN
+                    if object == "1":
+                        services[subject]["chargeFree"] = True
+                    elif object == "0":
+                        services[subject]["chargeFree"] = False
+                elif (
+                    predicate == "http://nfdi4earth.de/ontology/nonProfit"
+                ):  # nonProfit //BOOLEAN
+                    if object == "1":
+                        services[subject]["nonProfit"] = True
+                    elif object == "0":
+                        services[subject]["nonProfit"] = False
+                elif (
+                    predicate == "http://nfdi4earth.de/ontology/adFree"
+                ):  # adFree //BOOLEAN
+                    if object == "1":
+                        services[subject]["adFree"] = True
+                    elif object == "0":
+                        services[subject]["adFree"] = False
+                elif (
+                    predicate == "http://nfdi4earth.de/ontology/fees"
+                ):  # fees
+                    self.addValue(
+                        dict=services[subject],
+                        attribute="fees",
+                        value=object
+                    )
+                elif (
+                    predicate == "http://nfdi4earth.de/ontology/serviceAccessType"
+                ):  # serviceAccessType
+                    self.addValue(
+                        dict=services[subject],
+                        attribute="serviceAccessType",
+                        value=object
+                    )
+                elif (
+                    predicate == "http://nfdi4earth.de/ontology/logo"
+                ):  # logo
+                    self.addValue(
+                        dict=services[subject],
+                        attribute="logo",
+                        value=object
+                    )
+                elif (
+                    predicate == "http://nfdi4earth.de/ontology/userEnablement"
+                ):  # userEnablement
+                    self.addValue(
+                        dict=services[subject],
+                        attribute="userEnablement",
+                        value=object
+                    )
+                elif (
+                    predicate == "http://nfdi4earth.de/ontology/serviceEnablement"
+                ):  # serviceEnablement
+                    self.addValue(
+                        dict=services[subject],
+                        attribute="serviceEnablement",
+                        value=object
+                    )
+                elif (
+                    predicate == "http://nfdi4earth.de/ontology/personalDataProcessingAndStorage"
+                ):  # personalDataProcessingAndStorage
+                    self.addValue(
+                        dict=services[subject],
+                        attribute="personalDataProcessingAndStorage",
+                        value=object
+                    )
+                elif (
+                    predicate == "http://nfdi4earth.de/ontology/dataProtectionAndBackup"
+                ):  # dataProtectionAndBackup
+                    self.addValue(
+                        dict=services[subject],
+                        attribute="dataProtectionAndBackup",
+                        value=object
+                    )
+                elif (
+                    predicate == "http://nfdi4earth.de/ontology/securityIncidentContact"
+                ):  # securityIncidentContact  //ndoID
+                    self.addValue(
+                        dict=services[subject],
+                        attribute="securityIncidentContact",
+                        value=object
+                    )
+                elif (
+                    predicate == "http://nfdi4earth.de/ontology/servicePrivacyPolicy"
+                ):  # servicePrivacyPolicy
+                    self.addValue(
+                        dict=services[subject],
+                        attribute="servicePrivacyPolicy",
+                        value=object
+                    )
+                elif (
+                    predicate == "http://nfdi4earth.de/ontology/businessModel"
+                ):  # businessModel
+                    self.addValue(
+                        dict=services[subject],
+                        attribute="businessModel",
+                        value=object
+                    )
+                elif (
+                    predicate == "http://nfdi4earth.de/ontology/GDPRCompliant"
+                ):  # GDPRCompliant //BOOLEAN
+                    if object == "1":
+                        services[subject]["GDPRCompliant"] = True
+                    elif object == "0":
+                        services[subject]["GDPRCompliant"] = False
+                elif (
+                    predicate == "http://nfdi4earth.de/ontology/servicePublicationConsent"
+                ):  # servicePublicationConsent //BOOLEAN
+                    if object == "1":
+                        services[subject]["servicePublicationConsent"] = True
+                    elif object == "0":
+                        services[subject]["servicePublicationConsent"] = False
+                elif (
+                    predicate == "http://nfdi4earth.de/ontology/contactWithPortfolioManagement"
+                ):  # contactWithPortfolioManagement
+                    self.addValue(
+                        dict=services[subject],
+                        attribute="contactWithPortfolioManagement",
+                        value=object
+                    )
+                elif (
+                    predicate == "http://nfdi4earth.de/ontology/limitations"
+                ):  # limitations
+                    self.addValue(
+                        dict=services[subject],
+                        attribute="limitations",
+                        value=object
+                    )
+                elif predicate == "http://nfdi4earth.de/ontology/tangibleKPI":
+                    if "tangibleKPI_kpiType" in hit:
+                        self.addValue(
+                            services[subject],
+                            "tangibleKPI" + self.flatten_separator + "kpiType",
+                            hit["tangibleKPI_kpiType"]["value"],
+                        )
+                    if "tangibleKPI_kpiValue" in hit:
+                        self.addValue(
+                            services[subject],
+                            "tangibleKPI" + self.flatten_separator + "kpiValue",
+                            hit["tangibleKPI_kpiValue"]["value"],
+                        )
+                    if "tangibleKPI_kpiNotes" in hit:
+                        self.addValue(
+                            services[subject],
+                            "tangibleKPI" + self.flatten_separator + "kpiNotes",
+                            hit["tangibleKPI_kpiNotes"]["value"],
+                        )
+                elif (
+                    predicate == "http://nfdi4earth.de/ontology/idHostingInstitution"
+                ): #idHostingInstitution #hostingInstitution_name  #isN4EOperated
+                    host_rorID = object
+                    services[subject]["idHostingInstitution"] = host_rorID #idHostingInstitution
+                    n4e_organization = self.n4e_organizations_repo.get_n4e_organization_by_rorID(host_rorID) #check if rorID belongs to a organization that is n4e member
+                    if n4e_organization is not None:
+                        services[subject]["isN4EOperated"] = True #isN4EOperated
+                        services[subject]["hostingInstitution_name"] = n4e_organization["name"] #hostingInstitution_name  currently only for n4e operated services
+                        #only index if `True`, do not index if `False`

onestop4all_indexer-2.8.0.dev12/harvesters/harvester_softwaresourcecode.py → onestop4all_indexer-2.8.0.dev14/harvesters/harvester_software.py RENAMED Viewed

@@ -7,7 +7,7 @@ from utils import sparql
 log = logging.getLogger(__name__)
-class Softwaresourcecode_Harvester(Harvester):
+class Software_Harvester(Harvester):
     sparql_query = """
         PREFIX foaf: <http://xmlns.com/foaf/0.1/>
         prefix n4e: <http://nfdi4earth.de/ontology/>
@@ -17,25 +17,25 @@ class Softwaresourcecode_Harvester(Harvester):
         SELECT ?subject ?predicate ?object ?sourceSystem_homepage ?sourceSystem_title
         WHERE {
         {
-        ?subject rdf:type <http://schema.org/SoftwareSourceCode>.
+        ?subject rdf:type n4e:Software.
             ?subject ?predicate ?object
             FILTER (?predicate NOT IN (<http://schema.org/publisher>, <http://schema.org/audience>))
         }
         UNION{
             VALUES  ?predicate { <http://schema.org/publisher> }
-            ?subject rdf:type <http://schema.org/SoftwareSourceCode>;
+            ?subject rdf:type n4e:Software;
                     <http://schema.org/publisher> ?publisher.
             ?publisher <http://schema.org/name> ?object.
         }
         UNION{
             VALUES  ?predicate { <http://schema.org/audience> }
-            ?subject rdf:type <http://schema.org/SoftwareSourceCode>;
+            ?subject rdf:type n4e:Software;
                     <http://schema.org/audience> ?audience.
             ?audience dct:title ?object.
         }
         UNION{
             VALUES  ?predicate { n4e:sourceSystem }
-            ?subject rdf:type <http://schema.org/SoftwareSourceCode>;
+            ?subject rdf:type n4e:Software;
                     n4e:sourceSystem ?object.
             optional {?object dct:title ?sourceSystem_title.}
             optional {?object  foaf:homepage ?sourceSystem_homepage.}

{onestop4all_indexer-2.8.0.dev12 → onestop4all_indexer-2.8.0.dev14}/setup.py RENAMED Viewed

@@ -2,7 +2,7 @@ from setuptools import find_packages, setup
 setup(
     name="OneStop4All-Indexer",
-    version="2.8.0.dev12",
+    version="2.8.0.dev14",
     description="Library to harvest data from NFDI4Earth-KnowledgeHub to OneStop4All-Index",
     author="Markus Konkol, Arne Vogt, Tom Niers, Ralf Klammer",
     author_email="m.konkol@52north.org, a.vogt@52north.org, tom.niers@tu-dresden.de, ralf.klammer@tu-dresden.de",

{onestop4all_indexer-2.8.0.dev12 → onestop4all_indexer-2.8.0.dev14}/utils/cli.py RENAMED Viewed

@@ -55,7 +55,7 @@ def debug():
             "Organization",
             "Repository",
             "Service",
-            "Softwaresourcecode",
+            "Software",
         ],
         case_sensitive=False,
     ),

{onestop4all_indexer-2.8.0.dev12 → onestop4all_indexer-2.8.0.dev14}/utils/harvest.py RENAMED Viewed

@@ -16,7 +16,7 @@ from harvesters import (
     Repository_Harvester,
     Organization_Harvester,
     Article_Harvester,
-    Softwaresourcecode_Harvester,
+    Software_Harvester,
     Learningresource_Harvester,
     Metadatastandard_Harvester,
     Document_Harvester,
@@ -73,7 +73,7 @@ def get_harvester(
             get_repo("links_repo"),
             get_repo("themes_repo"),
         ),
-        "Softwaresourcecode": lambda: Softwaresourcecode_Harvester(
+        "Software": lambda: Software_Harvester(
             get_repo("persons_repo"),
             get_repo("links_repo"),
         ),

{onestop4all_indexer-2.8.0.dev12 → onestop4all_indexer-2.8.0.dev14}/utils/solr.py RENAMED Viewed

@@ -11,6 +11,7 @@ log = logging.getLogger(__name__)
 class Solr(object):
     def __init__(
         self,
         solr_url: Optional[str] = None,
@@ -19,42 +20,29 @@ class Solr(object):
         always_commit: bool = False,
         timeout: int = 5 * 60,
     ) -> None:
-        self.solr_url = solr_url
-        self.solr_core = solr_core
-        self.auth = solr_auth
+        self.solr_url = solr_url if solr_url else config["solr_url"]
+        self.solr_core = solr_core if solr_url else config["solr_url"]
+        self.auth = solr_auth if solr_url else config["solr_url"]
         self.client = SolrClient(
             self.endpoint,
             auth=self.authentication,
             always_commit=always_commit,
             timeout=timeout,
         )
+        # test connection to solr endpoint
+        # -> raises exception if connection fails
+        self.client.ping()
     @property
     def endpoint(self):
-        # using config-values (by default) OR
-        # overwrite with initially given values
-        # TODO: check if endpoint is reachable, if not raise error
-        solr_url = self.solr_url if self.solr_url else config["solr_url"]
-        log.debug(f"configured solr url: {solr_url}")
-        if solr_url.startswith("http://"):
-            raise ValueError(
-                "Insecure solr url configured. "
-                "Please check your configuration and use https."
-            )
-        solr_core = self.solr_core if self.solr_core else config["solr_core"]
-        log.debug(f"configured solr core: {solr_core}")
-        _endpoint = urljoin(solr_url, solr_core)
-        log.info(f"initialized solr client with endpoint: {_endpoint}")
-        return _endpoint
+        endpoint = urljoin(self.solr_url, self.solr_core)
+        log.info(f"initialized solr client with endpoint: {endpoint}")
+        return endpoint
     @property
     def authentication(self):
-        if self.auth or config["solr_auth"]:
-            username, password = (
-                self.auth.split(":")
-                if self.auth
-                else config["solr_auth"].split(":")
-            )
+        if self.auth:
+            username, password = self.auth.split(":")
             return HTTPBasicAuth(username, password)
     def index_documents(
@@ -69,7 +57,8 @@ class Solr(object):
             if len(documents) <= offset + batch_size:
                 batch = documents[offset:]
             else:
-                batch = documents[offset : (offset + batch_size)]
+                limit = offset + batch_size
+                batch = documents[offset:limit]
             if len(batch) == 0:
                 break

onestop4all_indexer-2.8.0.dev12/harvesters/harvester_service.py DELETED Viewed

@@ -1,224 +0,0 @@
-import logging
-from .harvester_base import Harvester
-from utils import sparql
-from data_repositories.repository_n4eorganization import RepositoryN4EOrganization
-log = logging.getLogger(__name__)
-#harvester for Services https://nfdi4earth.pages.rwth-aachen.de/knowledgehub/nfdi4earth-kh-schema/Service/
-class Service_Harvester(Harvester):
-    sparql_query = """
-        PREFIX fo: <http://www.w3.org/1999/XSL/Format#>
-        PREFIX foaf: <http://xmlns.com/foaf/0.1/>
-        PREFIX dc: <http://purl.org/dc/elements/1.1/>
-        PREFIX vcard: <http://www.w3.org/2006/vcard/ns#>
-        prefix dcat: <http://www.w3.org/ns/dcat#>
-        prefix dct: <http://purl.org/dc/terms/>
-        prefix n4e: <http://nfdi4earth.de/ontology/>
-        prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
-        SELECT  ?subject ?predicate ?object ?contactpoint_email ?contactpoint_url
-        {
-            {
-                ?subject rdf:type <http://schema.org/Service>.
-                ?subject ?predicate ?object
-                FILTER (?predicate NOT IN (dcat:contactPoint, n4e:sourceSystem))
-            }
-            UNION {
-                VALUES ?predicate { dcat:contactPoint }
-                ?subject rdf:type <http://schema.org/Service>;
-                        dcat:contactPoint ?object.
-                optional { ?object vcard:hasEmail ?contactpoint_email. }
-                optional { ?object vcard:hasURL ?contactpoint_url. }
-            }
-        }
-        ORDER BY ?subject ?predicate
-        OFFSET %d
-        LIMIT %d
-    """
-    def __init__(self, n4e_organizations_repo: RepositoryN4EOrganization, **kw):
-        super().__init__(**kw)
-        self.n4e_organizations_repo = n4e_organizations_repo
-    def harvest(self):
-        limit = 5000
-        # convert to list of repo documents for indexing
-        services = {}  # repos dict
-        i = 0
-        hits = {}
-        # split sparql query
-        while True:
-            query_splitted = self.sparql_query % (limit * i, limit)
-            hits = sparql.execute_query(self.sparql_endpoint, query_splitted)
-            self.parse_response(hits, services)
-            i = i + 1
-            if len(hits) < limit:
-                break
-        services_list = []
-        for (
-            key,
-            value,
-        ) in services.items():  # transform repos dict to list for indexing
-            service = value
-            if "mainTitle" not in service and len(service["name"]) > 0:
-                service["mainTitle"] = service["name"][0]
-            service["mainTitle"] = service["mainTitle"].strip()
-            services_list.append(service)
-        return services_list
-    def parse_response(
-            self, hits, services
-        ):
-            for hit in hits:
-                subject = hit["subject"]["value"]
-                predicate = hit["predicate"]["value"]
-                object = hit["object"]["value"]
-                if subject not in services:
-                    services[subject] = {}
-                    services[subject]["uri"] = subject
-                    services[subject]["id"] = self.getID(
-                        subject
-                    )  # use ID from triple store also in Solr to ensure stable IDs
-                if predicate == "http://schema.org/name":  # name
-                    if (
-                        "xml:lang" not in hit["object"]
-                        or hit["object"]["xml:lang"] == "en"
-                    ):  # use international name for orga name
-                        self.addValue(
-                            dict=services[subject],
-                            attribute="name",
-                            value=object,
-                        )
-                        services[subject]["mainTitle"] = object  # mainTitle
-                    if (
-                        "name" not in services[subject]
-                        or object not in services[subject]["name"]
-                    ):  # prevent duplicates
-                        self.addValue(
-                            dict=services[subject],
-                            attribute="name",
-                            value=object,
-                        )
-                elif predicate == "http://schema.org/description":  # description
-                    self.addValue(
-                        dict=services[subject],
-                        attribute="description",
-                        value=object,
-                    )
-                elif predicate == "http://schema.org/additionalType":  # additionalType
-                    self.addValue(
-                        dict=services[subject],
-                        attribute="additionalType",
-                        value=object,
-                    )
-                elif predicate == "http://schema.org/keywords":  # keyword
-                    self.addValue(
-                        dict=services[subject],
-                        attribute="keyword",
-                        value=object,
-                    )
-                elif predicate == "http://schema.org/url":  # url
-                    self.addValue(
-                        dict=services[subject],
-                        attribute="url",
-                        value=object,
-                    )
-                elif predicate == "http://nfdi4earth.de/ontology/serviceType":  # serviceType
-                    services[subject]["serviceType"] = object
-                elif predicate == "http://nfdi4earth.de/ontology/serviceHost":  # serviceHost
-                    services[subject]["serviceHost"] = object
-                elif predicate == "http://www.w3.org/ns/dcat#contactPoint":  # contactPoint
-                    if "contactpoint_email" in hit:
-                        self.addValue(
-                            services[subject],
-                            "contactPoint" + self.flatten_separator + "email",
-                            hit["contactpoint_email"]["value"],
-                        )
-                    if "contactpoint_url" in hit:
-                        self.addValue(
-                            services[subject],
-                            "contactPoint" + self.flatten_separator + "url",
-                            hit["contactpoint_url"]["value"],
-                        )
-                elif (
-                    predicate == "http://nfdi4earth.de/ontology/sourceSystemID"
-                ):  # sourceSystemID
-                    self.addValue(
-                        dict=services[subject],
-                        attribute="sourceSystem" + self.flatten_separator + "id",
-                        value=object,
-                    )
-                elif (
-                    predicate == "http://nfdi4earth.de/ontology/sourceSystem"
-                ):  # sourceSystem
-                    if "sourceSystem_homepage" in hit:
-                        self.addValue(
-                            services[subject],
-                            "sourceSystem" + self.flatten_separator + "homepage",
-                            hit["sourceSystem_homepage"]["value"],
-                        )
-                    if "sourceSystem_title" in hit:
-                        self.addValue(
-                            services[subject],
-                            "sourceSystem" + self.flatten_separator + "title",
-                            hit["sourceSystem_title"]["value"],
-                        )
-                elif (
-                    predicate == "http://nfdi4earth.de/ontology/sourceSystemURL"
-                ):  # sourceSystemURL
-                    self.addValue(
-                        dict=services[subject],
-                        attribute="sourceSystemURL",
-                        value=object,
-                    )
-                elif (
-                    predicate == "http://www.w3.org/2004/02/skos/core#altLabel"
-                ):  # altLabel
-                    self.addValue(
-                        dict=services[subject],
-                        attribute="altLabel",
-                        value=object,
-                    )
-                elif (
-                    predicate == "http://www.w3.org/1999/02/22-rdf-syntax-ns#type"
-                ):  # type
-                    self.addValue(
-                        dict=services[subject],
-                        attribute="type",
-                        value=object
-                    )
-                elif (
-                    predicate == "http://nfdi4earth.de/ontology/serviceType"
-                ):  # serviceType
-                    self.addValue(
-                        dict=services[subject],
-                        attribute="serviceType",
-                        value=object
-                    )
-                elif (
-                    predicate == "http://nfdi4earth.de/ontology/serviceCategory"
-                ):  # serviceCategory
-                    self.addValue(
-                        dict=services[subject],
-                        attribute="serviceCategory",
-                        value=object
-                    )
-                elif (
-                    predicate == "http://nfdi4earth.de/ontology/idHostingInstitution"
-                ): #idHostingInstitution #hostingInstitution_name  #isN4EOperated
-                    host_rorID = object
-                    services[subject]["idHostingInstitution"] = host_rorID #idHostingInstitution
-                    n4e_organization = self.n4e_organizations_repo.get_n4e_organization_by_rorID(host_rorID) #check if rorID belongs to a organization that is n4e member
-                    if n4e_organization is not None:
-                        services[subject]["isN4EOperated"] = True #isN4EOperated
-                        services[subject]["hostingInstitution_name"] = n4e_organization["name"] #hostingInstitution_name  currently only for n4e operated services
-                        #only index if `True`, do not index if `False`