OneStop4All-Indexer 2.8.0.dev12__tar.gz → 2.8.0.dev14__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {onestop4all_indexer-2.8.0.dev12 → onestop4all_indexer-2.8.0.dev14/OneStop4All_Indexer.egg-info}/PKG-INFO +1 -1
- {onestop4all_indexer-2.8.0.dev12 → onestop4all_indexer-2.8.0.dev14}/OneStop4All_Indexer.egg-info/SOURCES.txt +1 -1
- {onestop4all_indexer-2.8.0.dev12/OneStop4All_Indexer.egg-info → onestop4all_indexer-2.8.0.dev14}/PKG-INFO +1 -1
- {onestop4all_indexer-2.8.0.dev12 → onestop4all_indexer-2.8.0.dev14}/harvesters/__init__.py +1 -1
- {onestop4all_indexer-2.8.0.dev12 → onestop4all_indexer-2.8.0.dev14}/harvesters/harvester_organization.py +63 -105
- onestop4all_indexer-2.8.0.dev14/harvesters/harvester_service.py +551 -0
- onestop4all_indexer-2.8.0.dev12/harvesters/harvester_softwaresourcecode.py → onestop4all_indexer-2.8.0.dev14/harvesters/harvester_software.py +5 -5
- {onestop4all_indexer-2.8.0.dev12 → onestop4all_indexer-2.8.0.dev14}/setup.py +1 -1
- {onestop4all_indexer-2.8.0.dev12 → onestop4all_indexer-2.8.0.dev14}/utils/cli.py +1 -1
- {onestop4all_indexer-2.8.0.dev12 → onestop4all_indexer-2.8.0.dev14}/utils/harvest.py +2 -2
- {onestop4all_indexer-2.8.0.dev12 → onestop4all_indexer-2.8.0.dev14}/utils/solr.py +14 -25
- onestop4all_indexer-2.8.0.dev12/harvesters/harvester_service.py +0 -224
- {onestop4all_indexer-2.8.0.dev12 → onestop4all_indexer-2.8.0.dev14}/LICENSE +0 -0
- {onestop4all_indexer-2.8.0.dev12 → onestop4all_indexer-2.8.0.dev14}/OneStop4All_Indexer.egg-info/dependency_links.txt +0 -0
- {onestop4all_indexer-2.8.0.dev12 → onestop4all_indexer-2.8.0.dev14}/OneStop4All_Indexer.egg-info/entry_points.txt +0 -0
- {onestop4all_indexer-2.8.0.dev12 → onestop4all_indexer-2.8.0.dev14}/OneStop4All_Indexer.egg-info/requires.txt +0 -0
- {onestop4all_indexer-2.8.0.dev12 → onestop4all_indexer-2.8.0.dev14}/OneStop4All_Indexer.egg-info/top_level.txt +0 -0
- {onestop4all_indexer-2.8.0.dev12 → onestop4all_indexer-2.8.0.dev14}/data_repositories/__init__.py +0 -0
- {onestop4all_indexer-2.8.0.dev12 → onestop4all_indexer-2.8.0.dev14}/data_repositories/repository_base.py +0 -0
- {onestop4all_indexer-2.8.0.dev12 → onestop4all_indexer-2.8.0.dev14}/data_repositories/repository_n4eorganization.py +0 -0
- {onestop4all_indexer-2.8.0.dev12 → onestop4all_indexer-2.8.0.dev14}/data_repositories/repository_person.py +0 -0
- {onestop4all_indexer-2.8.0.dev12 → onestop4all_indexer-2.8.0.dev14}/data_repositories/repository_resource_links.py +0 -0
- {onestop4all_indexer-2.8.0.dev12 → onestop4all_indexer-2.8.0.dev14}/data_repositories/repository_theme.py +0 -0
- {onestop4all_indexer-2.8.0.dev12 → onestop4all_indexer-2.8.0.dev14}/harvesters/harvester_article.py +0 -0
- {onestop4all_indexer-2.8.0.dev12 → onestop4all_indexer-2.8.0.dev14}/harvesters/harvester_base.py +0 -0
- {onestop4all_indexer-2.8.0.dev12 → onestop4all_indexer-2.8.0.dev14}/harvesters/harvester_dataservice.py +0 -0
- {onestop4all_indexer-2.8.0.dev12 → onestop4all_indexer-2.8.0.dev14}/harvesters/harvester_dataset.py +0 -0
- {onestop4all_indexer-2.8.0.dev12 → onestop4all_indexer-2.8.0.dev14}/harvesters/harvester_document.py +0 -0
- {onestop4all_indexer-2.8.0.dev12 → onestop4all_indexer-2.8.0.dev14}/harvesters/harvester_learningresource.py +0 -0
- {onestop4all_indexer-2.8.0.dev12 → onestop4all_indexer-2.8.0.dev14}/harvesters/harvester_metadatastandards.py +0 -0
- {onestop4all_indexer-2.8.0.dev12 → onestop4all_indexer-2.8.0.dev14}/harvesters/harvester_repository.py +0 -0
- {onestop4all_indexer-2.8.0.dev12 → onestop4all_indexer-2.8.0.dev14}/pyproject.toml +0 -0
- {onestop4all_indexer-2.8.0.dev12 → onestop4all_indexer-2.8.0.dev14}/setup.cfg +0 -0
- {onestop4all_indexer-2.8.0.dev12 → onestop4all_indexer-2.8.0.dev14}/utils/__init__.py +0 -0
- {onestop4all_indexer-2.8.0.dev12 → onestop4all_indexer-2.8.0.dev14}/utils/configs.py +0 -0
- {onestop4all_indexer-2.8.0.dev12 → onestop4all_indexer-2.8.0.dev14}/utils/sparql.py +0 -0
- {onestop4all_indexer-2.8.0.dev12 → onestop4all_indexer-2.8.0.dev14}/utils/util.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: OneStop4All-Indexer
|
|
3
|
-
Version: 2.8.0.dev12
|
|
3
|
+
Version: 2.8.0.dev14
|
|
4
4
|
Summary: Library to harvest data from NFDI4Earth-KnowledgeHub to OneStop4All-Index
|
|
5
5
|
Author: Markus Konkol, Arne Vogt, Tom Niers, Ralf Klammer
|
|
6
6
|
Author-email: m.konkol@52north.org, a.vogt@52north.org, tom.niers@tu-dresden.de, ralf.klammer@tu-dresden.de
|
|
@@ -24,7 +24,7 @@ harvesters/harvester_metadatastandards.py
|
|
|
24
24
|
harvesters/harvester_organization.py
|
|
25
25
|
harvesters/harvester_repository.py
|
|
26
26
|
harvesters/harvester_service.py
|
|
27
|
-
harvesters/harvester_softwaresourcecode.py
|
|
27
|
+
harvesters/harvester_software.py
|
|
28
28
|
utils/__init__.py
|
|
29
29
|
utils/cli.py
|
|
30
30
|
utils/configs.py
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: OneStop4All-Indexer
|
|
3
|
-
Version: 2.8.0.dev12
|
|
3
|
+
Version: 2.8.0.dev14
|
|
4
4
|
Summary: Library to harvest data from NFDI4Earth-KnowledgeHub to OneStop4All-Index
|
|
5
5
|
Author: Markus Konkol, Arne Vogt, Tom Niers, Ralf Klammer
|
|
6
6
|
Author-email: m.konkol@52north.org, a.vogt@52north.org, tom.niers@tu-dresden.de, ralf.klammer@tu-dresden.de
|
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
from .harvester_repository import *
|
|
2
2
|
from .harvester_organization import *
|
|
3
3
|
from .harvester_article import *
|
|
4
|
-
from .harvester_softwaresourcecode import *
|
|
4
|
+
from .harvester_software import *
|
|
5
5
|
from .harvester_learningresource import *
|
|
6
6
|
from .harvester_metadatastandards import *
|
|
7
7
|
from .harvester_document import *
|
|
@@ -88,11 +88,34 @@ class Organization_Harvester(Harvester):
|
|
|
88
88
|
)
|
|
89
89
|
|
|
90
90
|
organization_list = []
|
|
91
|
-
for (
|
|
92
|
-
key,
|
|
93
|
-
value,
|
|
94
|
-
) in organizations.items(): # transform orga dict to list for indexing
|
|
91
|
+
for key, value in organizations.items(): # transform orga dict to list for indexing
|
|
95
92
|
organization = value
|
|
93
|
+
|
|
94
|
+
# clean geometry: if more than one geometry is present,
|
|
95
|
+
# keep only the first one (should not happen, but just in case)
|
|
96
|
+
geom = organization.get("geometry", [])
|
|
97
|
+
if len(geom) > 1:
|
|
98
|
+
organization["geometry"] = geom[:1]
|
|
99
|
+
|
|
100
|
+
# clean rorId: if more than one rorId is present,
|
|
101
|
+
# keep only unique ones (should not happen, but just in case)
|
|
102
|
+
rorid = organization.get("rorId", [])
|
|
103
|
+
if len(rorid) > 1:
|
|
104
|
+
organization["rorId"] = list(set(rorid))
|
|
105
|
+
|
|
106
|
+
# Transform locations
|
|
107
|
+
localities = organization.get("locality", [])
|
|
108
|
+
countries = organization.get("countryName", [])
|
|
109
|
+
complete_locations = []
|
|
110
|
+
|
|
111
|
+
for loc, country in zip(localities, countries):
|
|
112
|
+
complete_locations.append(f"{loc}, {country}")
|
|
113
|
+
|
|
114
|
+
# Filter duplication
|
|
115
|
+
organization["location"] = list(dict.fromkeys(complete_locations))
|
|
116
|
+
organization["locality"] = list(dict.fromkeys(localities))
|
|
117
|
+
organization["countryName"] = list(dict.fromkeys(countries))
|
|
118
|
+
|
|
96
119
|
# ensure mainTitle
|
|
97
120
|
if (
|
|
98
121
|
"mainTitle" not in organization
|
|
@@ -137,9 +160,23 @@ class Organization_Harvester(Harvester):
|
|
|
137
160
|
assignto_dict[subject[0]][attribute] = []
|
|
138
161
|
assignto_dict[subject[0]][attribute].extend(organization_name)
|
|
139
162
|
|
|
163
|
+
def does_object_exist(self, value, attribute: str, data: dict):
|
|
164
|
+
return attribute in data and value in data[attribute]
|
|
165
|
+
|
|
140
166
|
def parse_response(
|
|
141
167
|
self, hits, organizations, issuborganization, hasN4Econtact
|
|
142
168
|
):
|
|
169
|
+
PREDICATES = {
|
|
170
|
+
"http://xmlns.com/foaf/0.1/homepage": "homepage",
|
|
171
|
+
"http://www.w3.org/2002/07/owl#sameAs": "sameAs",
|
|
172
|
+
"http://w3id.org/nfdi4ing/metadata4ing#hasRorId": "rorId",
|
|
173
|
+
"http://www.w3.org/2004/02/skos/core#altLabel": "altLabel",
|
|
174
|
+
"http://www.w3.org/1999/02/22-rdf-syntax-ns#type": "type",
|
|
175
|
+
"http://nfdi4earth.de/ontology/sourceSystemURL": "sourceSystemURL",
|
|
176
|
+
"http://nfdi4earth.de/ontology/hasSignedCommitment": "hasSignedCommitment",
|
|
177
|
+
"http://nfdi4earth.de/ontology/sourceSystemID": "sourceSystem" + self.flatten_separator + "id"
|
|
178
|
+
}
|
|
179
|
+
|
|
143
180
|
for hit in hits:
|
|
144
181
|
subject = hit["subject"]["value"]
|
|
145
182
|
predicate = hit["predicate"]["value"]
|
|
@@ -151,6 +188,8 @@ class Organization_Harvester(Harvester):
|
|
|
151
188
|
organizations[subject]["id"] = self.getID(
|
|
152
189
|
subject
|
|
153
190
|
) # use ID from triple store also in Solr to ensure stable IDs
|
|
191
|
+
organizations[subject]["locality"] = []
|
|
192
|
+
organizations[subject]["countryName"] = []
|
|
154
193
|
|
|
155
194
|
# set geometry if available and not already set
|
|
156
195
|
if (
|
|
@@ -179,8 +218,9 @@ class Organization_Harvester(Harvester):
|
|
|
179
218
|
|
|
180
219
|
if predicate == "http://schema.org/name": # name
|
|
181
220
|
if (
|
|
182
|
-
"xml:lang" not in hit["object"]
|
|
183
|
-
or hit["object"]["xml:lang"] == "en"
|
|
221
|
+
("xml:lang" not in hit["object"]
|
|
222
|
+
or hit["object"]["xml:lang"] == "en")
|
|
223
|
+
and not self.does_object_exist(object, "name", organizations[subject])
|
|
184
224
|
): # use international name for orga name
|
|
185
225
|
self.addValue(
|
|
186
226
|
dict=organizations[subject],
|
|
@@ -189,60 +229,30 @@ class Organization_Harvester(Harvester):
|
|
|
189
229
|
)
|
|
190
230
|
organizations[subject]["mainTitle"] = object # mainTitle
|
|
191
231
|
if (
|
|
192
|
-
"name_alt" not in organizations[subject]
|
|
193
|
-
or object not in organizations[subject]["name_alt"]
|
|
232
|
+
not self.does_object_exist(object, "name_alt", organizations[subject])
|
|
194
233
|
): # prevent duplicates
|
|
195
234
|
self.addValue(
|
|
196
235
|
dict=organizations[subject],
|
|
197
236
|
attribute="name_alt",
|
|
198
237
|
value=object,
|
|
199
238
|
)
|
|
200
|
-
|
|
201
|
-
|
|
202
|
-
|
|
203
|
-
|
|
204
|
-
|
|
205
|
-
)
|
|
206
|
-
|
|
207
|
-
|
|
208
|
-
|
|
209
|
-
self.
|
|
210
|
-
|
|
211
|
-
|
|
212
|
-
|
|
213
|
-
|
|
214
|
-
elif (
|
|
215
|
-
predicate == "http://www.w3.org/2006/vcard/ns#country-name"
|
|
216
|
-
): # countryName
|
|
217
|
-
self.addValue(
|
|
218
|
-
dict=organizations[subject],
|
|
219
|
-
attribute="countryName",
|
|
220
|
-
value=object,
|
|
221
|
-
)
|
|
222
|
-
elif predicate == "http://www.w3.org/2002/07/owl#sameAs": # sameAs
|
|
223
|
-
self.addValue(
|
|
224
|
-
dict=organizations[subject],
|
|
225
|
-
attribute="sameAs",
|
|
226
|
-
value=object,
|
|
227
|
-
)
|
|
228
|
-
elif (
|
|
229
|
-
predicate == "http://w3id.org/nfdi4ing/metadata4ing#hasRorId"
|
|
230
|
-
): # rorId
|
|
231
|
-
self.addValue(
|
|
232
|
-
dict=organizations[subject],
|
|
233
|
-
attribute="rorId",
|
|
234
|
-
value=object,
|
|
235
|
-
)
|
|
236
|
-
elif (
|
|
237
|
-
predicate == "http://www.w3.org/2004/02/skos/core#altLabel"
|
|
238
|
-
): # altLabel
|
|
239
|
-
self.addValue(
|
|
240
|
-
dict=organizations[subject],
|
|
241
|
-
attribute="altLabel",
|
|
242
|
-
value=object,
|
|
243
|
-
)
|
|
239
|
+
|
|
240
|
+
elif predicate == "http://www.w3.org/2006/vcard/ns#locality":
|
|
241
|
+
organizations[subject]["locality"].append(object)
|
|
242
|
+
|
|
243
|
+
elif predicate == "http://www.w3.org/2006/vcard/ns#country-name":
|
|
244
|
+
organizations[subject]["countryName"].append(object)
|
|
245
|
+
|
|
246
|
+
elif predicate in PREDICATES:
|
|
247
|
+
attribute = PREDICATES[predicate]
|
|
248
|
+
if not self.does_object_exist(object, attribute, organizations[subject]):
|
|
249
|
+
self.addValue(
|
|
250
|
+
dict=organizations[subject],
|
|
251
|
+
attribute=attribute,
|
|
252
|
+
value=object)
|
|
244
253
|
elif (
|
|
245
254
|
predicate == "http://www.w3.org/ns/org#subOrganizationOf"
|
|
255
|
+
and (subject, object) not in issuborganization
|
|
246
256
|
): # subOrganizationOf
|
|
247
257
|
issuborganization.append(
|
|
248
258
|
(subject, object)
|
|
@@ -250,60 +260,8 @@ class Organization_Harvester(Harvester):
|
|
|
250
260
|
elif (
|
|
251
261
|
predicate
|
|
252
262
|
== "http://nfdi4earth.de/ontology/hasNFDI4EarthContactPerson"
|
|
263
|
+
and (subject, object) not in hasN4Econtact
|
|
253
264
|
): # NFDI4EarthContactPerson
|
|
254
265
|
hasN4Econtact.append(
|
|
255
266
|
(subject, object)
|
|
256
267
|
) # store, resolve contact info later
|
|
257
|
-
elif (
|
|
258
|
-
predicate == "http://www.w3.org/1999/02/22-rdf-syntax-ns#type"
|
|
259
|
-
): # type
|
|
260
|
-
self.addValue(
|
|
261
|
-
dict=organizations[subject], attribute="type", value=object
|
|
262
|
-
)
|
|
263
|
-
elif (
|
|
264
|
-
predicate == "http://nfdi4earth.de/ontology/sourceSystem"
|
|
265
|
-
): # sourceSystem
|
|
266
|
-
if "sourceSystem_homepage" in hit:
|
|
267
|
-
self.addValue(
|
|
268
|
-
organizations[subject],
|
|
269
|
-
"sourceSystem" + self.flatten_separator + "homepage",
|
|
270
|
-
hit["sourceSystem_homepage"]["value"],
|
|
271
|
-
)
|
|
272
|
-
if "sourceSystem_title" in hit:
|
|
273
|
-
self.addValue(
|
|
274
|
-
organizations[subject],
|
|
275
|
-
"sourceSystem" + self.flatten_separator + "title",
|
|
276
|
-
hit["sourceSystem_title"]["value"],
|
|
277
|
-
)
|
|
278
|
-
elif (
|
|
279
|
-
predicate == "http://nfdi4earth.de/ontology/sourceSystemID"
|
|
280
|
-
): # sourceSystemID
|
|
281
|
-
if (
|
|
282
|
-
"sourceSystem" + self.flatten_separator + "id"
|
|
283
|
-
not in organizations[subject]
|
|
284
|
-
):
|
|
285
|
-
# only set if not already present
|
|
286
|
-
self.addValue(
|
|
287
|
-
dict=organizations[subject],
|
|
288
|
-
attribute="sourceSystem"
|
|
289
|
-
+ self.flatten_separator
|
|
290
|
-
+ "id",
|
|
291
|
-
value=object,
|
|
292
|
-
)
|
|
293
|
-
elif (
|
|
294
|
-
predicate == "http://nfdi4earth.de/ontology/sourceSystemURL"
|
|
295
|
-
): # sourceSystemURL
|
|
296
|
-
self.addValue(
|
|
297
|
-
dict=organizations[subject],
|
|
298
|
-
attribute="sourceSystemURL",
|
|
299
|
-
value=object,
|
|
300
|
-
)
|
|
301
|
-
elif (
|
|
302
|
-
predicate
|
|
303
|
-
== "http://nfdi4earth.de/ontology/hasSignedCommitment"
|
|
304
|
-
): # hasSignedCommitment
|
|
305
|
-
self.addValue(
|
|
306
|
-
dict=organizations[subject],
|
|
307
|
-
attribute="hasSignedCommitment",
|
|
308
|
-
value=object,
|
|
309
|
-
)
|
|
@@ -0,0 +1,551 @@
|
|
|
1
|
+
import logging
|
|
2
|
+
from .harvester_base import Harvester
|
|
3
|
+
from utils import sparql
|
|
4
|
+
from data_repositories.repository_n4eorganization import (
|
|
5
|
+
RepositoryN4EOrganization,
|
|
6
|
+
)
|
|
7
|
+
|
|
8
|
+
log = logging.getLogger(__name__)
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
# harvester for Services https://nfdi4earth.pages.rwth-aachen.de/knowledgehub/nfdi4earth-kh-schema/Service/
|
|
12
|
+
class Service_Harvester(Harvester):
|
|
13
|
+
sparql_query = """
|
|
14
|
+
PREFIX fo: <http://www.w3.org/1999/XSL/Format#>
|
|
15
|
+
PREFIX foaf: <http://xmlns.com/foaf/0.1/>
|
|
16
|
+
PREFIX dc: <http://purl.org/dc/elements/1.1/>
|
|
17
|
+
PREFIX vcard: <http://www.w3.org/2006/vcard/ns#>
|
|
18
|
+
prefix dcat: <http://www.w3.org/ns/dcat#>
|
|
19
|
+
prefix dct: <http://purl.org/dc/terms/>
|
|
20
|
+
prefix n4e: <http://nfdi4earth.de/ontology/>
|
|
21
|
+
PREFIX m4i: <http://w3id.org/nfdi4ing/metadata4ing#>
|
|
22
|
+
PREFIX geo: <http://www.opengis.net/ont/geosparql#>
|
|
23
|
+
prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
|
|
24
|
+
PREFIX schema: <http://schema.org/>
|
|
25
|
+
|
|
26
|
+
SELECT ?subject ?predicate ?object
|
|
27
|
+
?contactpoint_email
|
|
28
|
+
?contactpoint_url
|
|
29
|
+
?contact_fn
|
|
30
|
+
?contact_email
|
|
31
|
+
?serviceProvider_homepage ?serviceProvider_imprint ?serviceProvider_rorId ?serviceProvider_name
|
|
32
|
+
?serviceLocationPoint
|
|
33
|
+
?tangibleKPI_kpiType ?tangibleKPI_kpiValue ?tangibleKPI_kpiNotes
|
|
34
|
+
WHERE {
|
|
35
|
+
{
|
|
36
|
+
# Page over distinct subjects first — avoids ORDER BY + OFFSET on the full
|
|
37
|
+
# UNION result set which forces full materialization and causes timeouts.
|
|
38
|
+
SELECT DISTINCT ?subject
|
|
39
|
+
WHERE {
|
|
40
|
+
?subject rdf:type n4e:Service .
|
|
41
|
+
}
|
|
42
|
+
OFFSET %d
|
|
43
|
+
LIMIT %d
|
|
44
|
+
}
|
|
45
|
+
|
|
46
|
+
{
|
|
47
|
+
?subject ?predicate ?object .
|
|
48
|
+
FILTER (?predicate NOT IN (dcat:contactPoint, n4e:sourceSystem))
|
|
49
|
+
}
|
|
50
|
+
UNION {
|
|
51
|
+
VALUES ?predicate { dcat:contactPoint }
|
|
52
|
+
?subject dcat:contactPoint ?object .
|
|
53
|
+
OPTIONAL { ?object vcard:hasEmail ?contactpoint_email . }
|
|
54
|
+
OPTIONAL { ?object vcard:hasURL ?contactpoint_url . }
|
|
55
|
+
}
|
|
56
|
+
UNION {
|
|
57
|
+
VALUES ?predicate { n4e:firstLevelSupportContact }
|
|
58
|
+
?subject n4e:firstLevelSupportContact ?object .
|
|
59
|
+
OPTIONAL { ?object vcard:fn ?contact_fn . }
|
|
60
|
+
OPTIONAL { ?object vcard:hasEmail ?contact_email . }
|
|
61
|
+
}
|
|
62
|
+
UNION {
|
|
63
|
+
VALUES ?predicate { n4e:securityIncidentContact }
|
|
64
|
+
?subject n4e:securityIncidentContact ?object .
|
|
65
|
+
OPTIONAL { ?object vcard:fn ?contact_fn . }
|
|
66
|
+
OPTIONAL { ?object vcard:hasEmail ?contact_email . }
|
|
67
|
+
}
|
|
68
|
+
UNION {
|
|
69
|
+
VALUES ?predicate { n4e:serviceOwner }
|
|
70
|
+
?subject n4e:serviceOwner ?object .
|
|
71
|
+
OPTIONAL { ?object vcard:fn ?contact_fn . }
|
|
72
|
+
OPTIONAL { ?object vcard:hasEmail ?contact_email . }
|
|
73
|
+
}
|
|
74
|
+
UNION {
|
|
75
|
+
VALUES ?predicate { n4e:serviceManager }
|
|
76
|
+
?subject n4e:serviceManager ?object .
|
|
77
|
+
OPTIONAL { ?object vcard:fn ?contact_fn . }
|
|
78
|
+
OPTIONAL { ?object vcard:hasEmail ?contact_email . }
|
|
79
|
+
}
|
|
80
|
+
UNION {
|
|
81
|
+
VALUES ?predicate { n4e:serviceProvider }
|
|
82
|
+
?subject n4e:serviceProvider ?object .
|
|
83
|
+
OPTIONAL { ?object foaf:homepage ?serviceProvider_homepage . }
|
|
84
|
+
OPTIONAL { ?object n4e:hasImprint ?serviceProvider_imprint . }
|
|
85
|
+
OPTIONAL { ?object m4i:hasRorId ?serviceProvider_rorId . }
|
|
86
|
+
OPTIONAL { ?object schema:name ?serviceProvider_name . }
|
|
87
|
+
}
|
|
88
|
+
UNION {
|
|
89
|
+
VALUES ?predicate { n4e:serviceLocation }
|
|
90
|
+
?subject n4e:serviceLocation ?object .
|
|
91
|
+
OPTIONAL { ?object geo:asWKT ?serviceLocationPoint . }
|
|
92
|
+
}
|
|
93
|
+
UNION {
|
|
94
|
+
VALUES ?predicate { n4e:tangibleKPI }
|
|
95
|
+
?subject n4e:tangibleKPI ?object .
|
|
96
|
+
OPTIONAL { ?object n4e:kpiType ?tangibleKPI_kpiType . }
|
|
97
|
+
OPTIONAL { ?object n4e:kpiValue ?tangibleKPI_kpiValue . }
|
|
98
|
+
OPTIONAL { ?object n4e:kpiNotes ?tangibleKPI_kpiNotes . }
|
|
99
|
+
}
|
|
100
|
+
}
|
|
101
|
+
"""
|
|
102
|
+
|
|
103
|
+
def __init__(
|
|
104
|
+
self, n4e_organizations_repo: RepositoryN4EOrganization, **kw
|
|
105
|
+
):
|
|
106
|
+
super().__init__(**kw)
|
|
107
|
+
self.n4e_organizations_repo = n4e_organizations_repo
|
|
108
|
+
|
|
109
|
+
def harvest(self):
|
|
110
|
+
limit = 5000
|
|
111
|
+
# convert to list of repo documents for indexing
|
|
112
|
+
services = {} # repos dict
|
|
113
|
+
|
|
114
|
+
i = 0
|
|
115
|
+
# split sparql query by paging over distinct subjects
|
|
116
|
+
# (sub-SELECT OFFSET/LIMIT)
|
|
117
|
+
while True:
|
|
118
|
+
query_splitted = self.sparql_query % (limit * i, limit)
|
|
119
|
+
hits = sparql.execute_query(self.sparql_endpoint, query_splitted)
|
|
120
|
+
|
|
121
|
+
subjects_before = len(services)
|
|
122
|
+
self.parse_response(hits, services)
|
|
123
|
+
new_subjects = len(services) - subjects_before
|
|
124
|
+
|
|
125
|
+
i += 1
|
|
126
|
+
|
|
127
|
+
# Stop when the sub-SELECT returned fewer subjects than the page size
|
|
128
|
+
if new_subjects < limit:
|
|
129
|
+
break
|
|
130
|
+
|
|
131
|
+
services_list = []
|
|
132
|
+
for (
|
|
133
|
+
key,
|
|
134
|
+
value,
|
|
135
|
+
) in services.items(): # transform repos dict to list for indexing
|
|
136
|
+
service = value
|
|
137
|
+
if "mainTitle" not in service and len(service["name"]) > 0:
|
|
138
|
+
service["mainTitle"] = service["name"][0]
|
|
139
|
+
service["mainTitle"] = service["mainTitle"].strip()
|
|
140
|
+
services_list.append(service)
|
|
141
|
+
|
|
142
|
+
return services_list
|
|
143
|
+
|
|
144
|
+
def parse_response(
|
|
145
|
+
self, hits, services
|
|
146
|
+
):
|
|
147
|
+
for hit in hits:
|
|
148
|
+
subject = hit["subject"]["value"]
|
|
149
|
+
predicate = hit["predicate"]["value"]
|
|
150
|
+
object = hit["object"]["value"]
|
|
151
|
+
|
|
152
|
+
if subject not in services:
|
|
153
|
+
services[subject] = {}
|
|
154
|
+
services[subject]["uri"] = subject
|
|
155
|
+
services[subject]["id"] = self.getID(
|
|
156
|
+
subject
|
|
157
|
+
) # use ID from triple store also in Solr to ensure stable IDs
|
|
158
|
+
|
|
159
|
+
if predicate == "http://schema.org/name": # name
|
|
160
|
+
if (
|
|
161
|
+
"xml:lang" not in hit["object"]
|
|
162
|
+
or hit["object"]["xml:lang"] == "en"
|
|
163
|
+
): # use international name for orga name
|
|
164
|
+
self.addValue(
|
|
165
|
+
dict=services[subject],
|
|
166
|
+
attribute="name",
|
|
167
|
+
value=object,
|
|
168
|
+
)
|
|
169
|
+
services[subject]["mainTitle"] = object # mainTitle
|
|
170
|
+
if (
|
|
171
|
+
"name" not in services[subject]
|
|
172
|
+
or object not in services[subject]["name"]
|
|
173
|
+
): # prevent duplicates
|
|
174
|
+
self.addValue(
|
|
175
|
+
dict=services[subject],
|
|
176
|
+
attribute="name",
|
|
177
|
+
value=object,
|
|
178
|
+
)
|
|
179
|
+
elif predicate == "http://schema.org/description": # description
|
|
180
|
+
self.addValue(
|
|
181
|
+
dict=services[subject],
|
|
182
|
+
attribute="description",
|
|
183
|
+
value=object,
|
|
184
|
+
)
|
|
185
|
+
elif predicate == "http://schema.org/additionalType": # additionalType
|
|
186
|
+
self.addValue(
|
|
187
|
+
dict=services[subject],
|
|
188
|
+
attribute="additionalType",
|
|
189
|
+
value=object,
|
|
190
|
+
)
|
|
191
|
+
elif predicate == "http://schema.org/keywords": # keyword
|
|
192
|
+
self.addValue(
|
|
193
|
+
dict=services[subject],
|
|
194
|
+
attribute="keyword",
|
|
195
|
+
value=object,
|
|
196
|
+
)
|
|
197
|
+
elif predicate == "http://schema.org/url": # url
|
|
198
|
+
self.addValue(
|
|
199
|
+
dict=services[subject],
|
|
200
|
+
attribute="url",
|
|
201
|
+
value=object,
|
|
202
|
+
)
|
|
203
|
+
elif predicate == "http://nfdi4earth.de/ontology/serviceType": # serviceType
|
|
204
|
+
services[subject]["serviceType"] = object
|
|
205
|
+
elif predicate == "http://nfdi4earth.de/ontology/serviceHost": # serviceHost
|
|
206
|
+
services[subject]["serviceHost"] = object
|
|
207
|
+
elif predicate == "http://www.w3.org/ns/dcat#contactPoint": # contactPoint
|
|
208
|
+
if "contactpoint_email" in hit:
|
|
209
|
+
self.addValue(
|
|
210
|
+
services[subject],
|
|
211
|
+
"contactPoint" + self.flatten_separator + "email",
|
|
212
|
+
hit["contactpoint_email"]["value"],
|
|
213
|
+
)
|
|
214
|
+
if "contactpoint_url" in hit:
|
|
215
|
+
self.addValue(
|
|
216
|
+
services[subject],
|
|
217
|
+
"contactPoint" + self.flatten_separator + "url",
|
|
218
|
+
hit["contactpoint_url"]["value"],
|
|
219
|
+
)
|
|
220
|
+
elif predicate == "http://nfdi4earth.de/ontology/firstLevelSupportContact":
|
|
221
|
+
if "contact_fn" in hit:
|
|
222
|
+
self.addValue(
|
|
223
|
+
services[subject],
|
|
224
|
+
"firstLevelSupportContact" + self.flatten_separator + "fullname",
|
|
225
|
+
hit["contact_fn"]["value"],
|
|
226
|
+
)
|
|
227
|
+
if "contact_email" in hit:
|
|
228
|
+
self.addValue(
|
|
229
|
+
services[subject],
|
|
230
|
+
"firstLevelSupportContact" + self.flatten_separator + "hasEmail",
|
|
231
|
+
hit["contact_email"]["value"],
|
|
232
|
+
)
|
|
233
|
+
elif predicate == "http://nfdi4earth.de/ontology/serviceOwner":
|
|
234
|
+
if "contact_fn" in hit:
|
|
235
|
+
self.addValue(
|
|
236
|
+
services[subject],
|
|
237
|
+
"serviceOwner" + self.flatten_separator + "fullname",
|
|
238
|
+
hit["contact_fn"]["value"],
|
|
239
|
+
)
|
|
240
|
+
if "contact_email" in hit:
|
|
241
|
+
self.addValue(
|
|
242
|
+
services[subject],
|
|
243
|
+
"serviceOwner" + self.flatten_separator + "hasEmail",
|
|
244
|
+
hit["contact_email"]["value"],
|
|
245
|
+
)
|
|
246
|
+
elif predicate == "http://nfdi4earth.de/ontology/serviceManager":
|
|
247
|
+
if "contact_fn" in hit:
|
|
248
|
+
self.addValue(
|
|
249
|
+
services[subject],
|
|
250
|
+
"serviceManager" + self.flatten_separator + "fullname",
|
|
251
|
+
hit["contact_fn"]["value"],
|
|
252
|
+
)
|
|
253
|
+
if "contact_email" in hit:
|
|
254
|
+
self.addValue(
|
|
255
|
+
services[subject],
|
|
256
|
+
"serviceManager" + self.flatten_separator + "hasEmail",
|
|
257
|
+
hit["contact_email"]["value"],
|
|
258
|
+
)
|
|
259
|
+
elif predicate == "http://nfdi4earth.de/ontology/serviceProvider":
|
|
260
|
+
if "serviceProvider_homepage" in hit:
|
|
261
|
+
self.addValue(
|
|
262
|
+
services[subject],
|
|
263
|
+
"serviceProvider" + self.flatten_separator + "homepage",
|
|
264
|
+
hit["serviceProvider_homepage"]["value"],
|
|
265
|
+
)
|
|
266
|
+
if "serviceProvider_imprint" in hit:
|
|
267
|
+
self.addValue(
|
|
268
|
+
services[subject],
|
|
269
|
+
"serviceProvider" + self.flatten_separator + "imprint",
|
|
270
|
+
hit["serviceProvider_imprint"]["value"],
|
|
271
|
+
)
|
|
272
|
+
if "serviceProvider_rorId" in hit:
|
|
273
|
+
self.addValue(
|
|
274
|
+
services[subject],
|
|
275
|
+
"serviceProvider" + self.flatten_separator + "rorId",
|
|
276
|
+
hit["serviceProvider_rorId"]["value"],
|
|
277
|
+
)
|
|
278
|
+
if "serviceProvider_name" in hit:
|
|
279
|
+
self.addValue(
|
|
280
|
+
services[subject],
|
|
281
|
+
"serviceProvider" + self.flatten_separator + "name",
|
|
282
|
+
hit["serviceProvider_name"]["value"],
|
|
283
|
+
)
|
|
284
|
+
elif predicate == "http://nfdi4earth.de/ontology/serviceLocation":
|
|
285
|
+
if "serviceLocationPoint" in hit:
|
|
286
|
+
self.addValue(
|
|
287
|
+
services[subject],
|
|
288
|
+
"serviceLocationPoint",
|
|
289
|
+
hit["serviceLocationPoint"]["value"],
|
|
290
|
+
)
|
|
291
|
+
elif predicate == "http://nfdi4earth.de/ontology/securityIncidentContact":
|
|
292
|
+
if "contact_fn" in hit:
|
|
293
|
+
self.addValue(
|
|
294
|
+
services[subject],
|
|
295
|
+
"securityIncidentContact" + self.flatten_separator + "fullname",
|
|
296
|
+
hit["contact_fn"]["value"],
|
|
297
|
+
)
|
|
298
|
+
if "contact_email" in hit:
|
|
299
|
+
self.addValue(
|
|
300
|
+
services[subject],
|
|
301
|
+
"securityIncidentContact" + self.flatten_separator + "hasEmail",
|
|
302
|
+
hit["contact_email"]["value"],
|
|
303
|
+
)
|
|
304
|
+
elif (
|
|
305
|
+
predicate == "http://nfdi4earth.de/ontology/sourceSystemID"
|
|
306
|
+
): # sourceSystemID
|
|
307
|
+
self.addValue(
|
|
308
|
+
dict=services[subject],
|
|
309
|
+
attribute="sourceSystem" + self.flatten_separator + "id",
|
|
310
|
+
value=object,
|
|
311
|
+
)
|
|
312
|
+
elif (
|
|
313
|
+
predicate == "http://nfdi4earth.de/ontology/sourceSystem"
|
|
314
|
+
): # sourceSystem
|
|
315
|
+
if "sourceSystem_homepage" in hit:
|
|
316
|
+
self.addValue(
|
|
317
|
+
services[subject],
|
|
318
|
+
"sourceSystem" + self.flatten_separator + "homepage",
|
|
319
|
+
hit["sourceSystem_homepage"]["value"],
|
|
320
|
+
)
|
|
321
|
+
if "sourceSystem_title" in hit:
|
|
322
|
+
self.addValue(
|
|
323
|
+
services[subject],
|
|
324
|
+
"sourceSystem" + self.flatten_separator + "title",
|
|
325
|
+
hit["sourceSystem_title"]["value"],
|
|
326
|
+
)
|
|
327
|
+
elif (
|
|
328
|
+
predicate == "http://nfdi4earth.de/ontology/sourceSystemURL"
|
|
329
|
+
): # sourceSystemURL
|
|
330
|
+
self.addValue(
|
|
331
|
+
dict=services[subject],
|
|
332
|
+
attribute="sourceSystemURL",
|
|
333
|
+
value=object,
|
|
334
|
+
)
|
|
335
|
+
elif (
|
|
336
|
+
predicate == "http://www.w3.org/2004/02/skos/core#altLabel"
|
|
337
|
+
): # altLabel
|
|
338
|
+
self.addValue(
|
|
339
|
+
dict=services[subject],
|
|
340
|
+
attribute="altLabel",
|
|
341
|
+
value=object,
|
|
342
|
+
)
|
|
343
|
+
elif (
|
|
344
|
+
predicate == "http://www.w3.org/1999/02/22-rdf-syntax-ns#type"
|
|
345
|
+
): # type
|
|
346
|
+
self.addValue(
|
|
347
|
+
dict=services[subject],
|
|
348
|
+
attribute="type",
|
|
349
|
+
value=object
|
|
350
|
+
)
|
|
351
|
+
elif (
|
|
352
|
+
predicate == "http://nfdi4earth.de/ontology/serviceType"
|
|
353
|
+
): # serviceType
|
|
354
|
+
self.addValue(
|
|
355
|
+
dict=services[subject],
|
|
356
|
+
attribute="serviceType",
|
|
357
|
+
value=object
|
|
358
|
+
)
|
|
359
|
+
elif (
|
|
360
|
+
predicate == "http://nfdi4earth.de/ontology/serviceCategory"
|
|
361
|
+
): # serviceCategory
|
|
362
|
+
self.addValue(
|
|
363
|
+
dict=services[subject],
|
|
364
|
+
attribute="serviceCategory",
|
|
365
|
+
value=object
|
|
366
|
+
)
|
|
367
|
+
elif (
|
|
368
|
+
predicate == "http://nfdi4earth.de/ontology/linkToDocumentation"
|
|
369
|
+
): # linkToDocumentation
|
|
370
|
+
self.addValue(
|
|
371
|
+
dict=services[subject],
|
|
372
|
+
attribute="linkToDocumentation",
|
|
373
|
+
value=object
|
|
374
|
+
)
|
|
375
|
+
elif (
|
|
376
|
+
predicate == "http://nfdi4earth.de/ontology/nameAbbreviation"
|
|
377
|
+
): # nameAbbreviation
|
|
378
|
+
self.addValue(
|
|
379
|
+
dict=services[subject],
|
|
380
|
+
attribute="nameAbbreviation",
|
|
381
|
+
value=object
|
|
382
|
+
)
|
|
383
|
+
elif (
|
|
384
|
+
predicate == "http://nfdi4earth.de/ontology/shortDescription"
|
|
385
|
+
): # shortDescription
|
|
386
|
+
self.addValue(
|
|
387
|
+
dict=services[subject],
|
|
388
|
+
attribute="shortDescription",
|
|
389
|
+
value=object
|
|
390
|
+
)
|
|
391
|
+
elif (
|
|
392
|
+
predicate == "http://nfdi4earth.de/ontology/chargeFree"
|
|
393
|
+
): # chargeFree //BOOLEAN
|
|
394
|
+
if object == "1":
|
|
395
|
+
services[subject]["chargeFree"] = True
|
|
396
|
+
elif object == "0":
|
|
397
|
+
services[subject]["chargeFree"] = False
|
|
398
|
+
elif (
|
|
399
|
+
predicate == "http://nfdi4earth.de/ontology/nonProfit"
|
|
400
|
+
): # nonProfit //BOOLEAN
|
|
401
|
+
if object == "1":
|
|
402
|
+
services[subject]["nonProfit"] = True
|
|
403
|
+
elif object == "0":
|
|
404
|
+
services[subject]["nonProfit"] = False
|
|
405
|
+
elif (
|
|
406
|
+
predicate == "http://nfdi4earth.de/ontology/adFree"
|
|
407
|
+
): # adFree //BOOLEAN
|
|
408
|
+
if object == "1":
|
|
409
|
+
services[subject]["adFree"] = True
|
|
410
|
+
elif object == "0":
|
|
411
|
+
services[subject]["adFree"] = False
|
|
412
|
+
elif (
|
|
413
|
+
predicate == "http://nfdi4earth.de/ontology/fees"
|
|
414
|
+
): # fees
|
|
415
|
+
self.addValue(
|
|
416
|
+
dict=services[subject],
|
|
417
|
+
attribute="fees",
|
|
418
|
+
value=object
|
|
419
|
+
)
|
|
420
|
+
elif (
|
|
421
|
+
predicate == "http://nfdi4earth.de/ontology/serviceAccessType"
|
|
422
|
+
): # serviceAccessType
|
|
423
|
+
self.addValue(
|
|
424
|
+
dict=services[subject],
|
|
425
|
+
attribute="serviceAccessType",
|
|
426
|
+
value=object
|
|
427
|
+
)
|
|
428
|
+
elif (
|
|
429
|
+
predicate == "http://nfdi4earth.de/ontology/logo"
|
|
430
|
+
): # logo
|
|
431
|
+
self.addValue(
|
|
432
|
+
dict=services[subject],
|
|
433
|
+
attribute="logo",
|
|
434
|
+
value=object
|
|
435
|
+
)
|
|
436
|
+
elif (
|
|
437
|
+
predicate == "http://nfdi4earth.de/ontology/userEnablement"
|
|
438
|
+
): # userEnablement
|
|
439
|
+
self.addValue(
|
|
440
|
+
dict=services[subject],
|
|
441
|
+
attribute="userEnablement",
|
|
442
|
+
value=object
|
|
443
|
+
)
|
|
444
|
+
elif (
|
|
445
|
+
predicate == "http://nfdi4earth.de/ontology/serviceEnablement"
|
|
446
|
+
): # serviceEnablement
|
|
447
|
+
self.addValue(
|
|
448
|
+
dict=services[subject],
|
|
449
|
+
attribute="serviceEnablement",
|
|
450
|
+
value=object
|
|
451
|
+
)
|
|
452
|
+
elif (
|
|
453
|
+
predicate == "http://nfdi4earth.de/ontology/personalDataProcessingAndStorage"
|
|
454
|
+
): # personalDataProcessingAndStorage
|
|
455
|
+
self.addValue(
|
|
456
|
+
dict=services[subject],
|
|
457
|
+
attribute="personalDataProcessingAndStorage",
|
|
458
|
+
value=object
|
|
459
|
+
)
|
|
460
|
+
elif (
|
|
461
|
+
predicate == "http://nfdi4earth.de/ontology/dataProtectionAndBackup"
|
|
462
|
+
): # dataProtectionAndBackup
|
|
463
|
+
self.addValue(
|
|
464
|
+
dict=services[subject],
|
|
465
|
+
attribute="dataProtectionAndBackup",
|
|
466
|
+
value=object
|
|
467
|
+
)
|
|
468
|
+
elif (
|
|
469
|
+
predicate == "http://nfdi4earth.de/ontology/securityIncidentContact"
|
|
470
|
+
): # securityIncidentContact //ndoID
|
|
471
|
+
self.addValue(
|
|
472
|
+
dict=services[subject],
|
|
473
|
+
attribute="securityIncidentContact",
|
|
474
|
+
value=object
|
|
475
|
+
)
|
|
476
|
+
elif (
|
|
477
|
+
predicate == "http://nfdi4earth.de/ontology/servicePrivacyPolicy"
|
|
478
|
+
): # servicePrivacyPolicy
|
|
479
|
+
self.addValue(
|
|
480
|
+
dict=services[subject],
|
|
481
|
+
attribute="servicePrivacyPolicy",
|
|
482
|
+
value=object
|
|
483
|
+
)
|
|
484
|
+
elif (
|
|
485
|
+
predicate == "http://nfdi4earth.de/ontology/businessModel"
|
|
486
|
+
): # businessModel
|
|
487
|
+
self.addValue(
|
|
488
|
+
dict=services[subject],
|
|
489
|
+
attribute="businessModel",
|
|
490
|
+
value=object
|
|
491
|
+
)
|
|
492
|
+
elif (
|
|
493
|
+
predicate == "http://nfdi4earth.de/ontology/GDPRCompliant"
|
|
494
|
+
): # GDPRCompliant //BOOLEAN
|
|
495
|
+
if object == "1":
|
|
496
|
+
services[subject]["GDPRCompliant"] = True
|
|
497
|
+
elif object == "0":
|
|
498
|
+
services[subject]["GDPRCompliant"] = False
|
|
499
|
+
elif (
|
|
500
|
+
predicate == "http://nfdi4earth.de/ontology/servicePublicationConsent"
|
|
501
|
+
): # servicePublicationConsent //BOOLEAN
|
|
502
|
+
if object == "1":
|
|
503
|
+
services[subject]["servicePublicationConsent"] = True
|
|
504
|
+
elif object == "0":
|
|
505
|
+
services[subject]["servicePublicationConsent"] = False
|
|
506
|
+
elif (
|
|
507
|
+
predicate == "http://nfdi4earth.de/ontology/contactWithPortfolioManagement"
|
|
508
|
+
): # contactWithPortfolioManagement
|
|
509
|
+
self.addValue(
|
|
510
|
+
dict=services[subject],
|
|
511
|
+
attribute="contactWithPortfolioManagement",
|
|
512
|
+
value=object
|
|
513
|
+
)
|
|
514
|
+
elif (
|
|
515
|
+
predicate == "http://nfdi4earth.de/ontology/limitations"
|
|
516
|
+
): # limitations
|
|
517
|
+
self.addValue(
|
|
518
|
+
dict=services[subject],
|
|
519
|
+
attribute="limitations",
|
|
520
|
+
value=object
|
|
521
|
+
)
|
|
522
|
+
elif predicate == "http://nfdi4earth.de/ontology/tangibleKPI":
|
|
523
|
+
if "tangibleKPI_kpiType" in hit:
|
|
524
|
+
self.addValue(
|
|
525
|
+
services[subject],
|
|
526
|
+
"tangibleKPI" + self.flatten_separator + "kpiType",
|
|
527
|
+
hit["tangibleKPI_kpiType"]["value"],
|
|
528
|
+
)
|
|
529
|
+
if "tangibleKPI_kpiValue" in hit:
|
|
530
|
+
self.addValue(
|
|
531
|
+
services[subject],
|
|
532
|
+
"tangibleKPI" + self.flatten_separator + "kpiValue",
|
|
533
|
+
hit["tangibleKPI_kpiValue"]["value"],
|
|
534
|
+
)
|
|
535
|
+
if "tangibleKPI_kpiNotes" in hit:
|
|
536
|
+
self.addValue(
|
|
537
|
+
services[subject],
|
|
538
|
+
"tangibleKPI" + self.flatten_separator + "kpiNotes",
|
|
539
|
+
hit["tangibleKPI_kpiNotes"]["value"],
|
|
540
|
+
)
|
|
541
|
+
elif (
|
|
542
|
+
predicate == "http://nfdi4earth.de/ontology/idHostingInstitution"
|
|
543
|
+
): #idHostingInstitution #hostingInstitution_name #isN4EOperated
|
|
544
|
+
host_rorID = object
|
|
545
|
+
services[subject]["idHostingInstitution"] = host_rorID #idHostingInstitution
|
|
546
|
+
|
|
547
|
+
n4e_organization = self.n4e_organizations_repo.get_n4e_organization_by_rorID(host_rorID) #check if rorID belongs to a organization that is n4e member
|
|
548
|
+
if n4e_organization is not None:
|
|
549
|
+
services[subject]["isN4EOperated"] = True #isN4EOperated
|
|
550
|
+
services[subject]["hostingInstitution_name"] = n4e_organization["name"] #hostingInstitution_name currently only for n4e operated services
|
|
551
|
+
#only index if `True`, do not index if `False`
|
|
@@ -7,7 +7,7 @@ from utils import sparql
|
|
|
7
7
|
log = logging.getLogger(__name__)
|
|
8
8
|
|
|
9
9
|
|
|
10
|
-
class
|
|
10
|
+
class Software_Harvester(Harvester):
|
|
11
11
|
sparql_query = """
|
|
12
12
|
PREFIX foaf: <http://xmlns.com/foaf/0.1/>
|
|
13
13
|
prefix n4e: <http://nfdi4earth.de/ontology/>
|
|
@@ -17,25 +17,25 @@ class Softwaresourcecode_Harvester(Harvester):
|
|
|
17
17
|
SELECT ?subject ?predicate ?object ?sourceSystem_homepage ?sourceSystem_title
|
|
18
18
|
WHERE {
|
|
19
19
|
{
|
|
20
|
-
?subject rdf:type
|
|
20
|
+
?subject rdf:type n4e:Software.
|
|
21
21
|
?subject ?predicate ?object
|
|
22
22
|
FILTER (?predicate NOT IN (<http://schema.org/publisher>, <http://schema.org/audience>))
|
|
23
23
|
}
|
|
24
24
|
UNION{
|
|
25
25
|
VALUES ?predicate { <http://schema.org/publisher> }
|
|
26
|
-
?subject rdf:type
|
|
26
|
+
?subject rdf:type n4e:Software;
|
|
27
27
|
<http://schema.org/publisher> ?publisher.
|
|
28
28
|
?publisher <http://schema.org/name> ?object.
|
|
29
29
|
}
|
|
30
30
|
UNION{
|
|
31
31
|
VALUES ?predicate { <http://schema.org/audience> }
|
|
32
|
-
?subject rdf:type
|
|
32
|
+
?subject rdf:type n4e:Software;
|
|
33
33
|
<http://schema.org/audience> ?audience.
|
|
34
34
|
?audience dct:title ?object.
|
|
35
35
|
}
|
|
36
36
|
UNION{
|
|
37
37
|
VALUES ?predicate { n4e:sourceSystem }
|
|
38
|
-
?subject rdf:type
|
|
38
|
+
?subject rdf:type n4e:Software;
|
|
39
39
|
n4e:sourceSystem ?object.
|
|
40
40
|
optional {?object dct:title ?sourceSystem_title.}
|
|
41
41
|
optional {?object foaf:homepage ?sourceSystem_homepage.}
|
|
@@ -2,7 +2,7 @@ from setuptools import find_packages, setup
|
|
|
2
2
|
|
|
3
3
|
setup(
|
|
4
4
|
name="OneStop4All-Indexer",
|
|
5
|
-
version="2.8.0.
|
|
5
|
+
version="2.8.0.dev14",
|
|
6
6
|
description="Library to harvest data from NFDI4Earth-KnowledgeHub to OneStop4All-Index",
|
|
7
7
|
author="Markus Konkol, Arne Vogt, Tom Niers, Ralf Klammer",
|
|
8
8
|
author_email="m.konkol@52north.org, a.vogt@52north.org, tom.niers@tu-dresden.de, ralf.klammer@tu-dresden.de",
|
|
@@ -16,7 +16,7 @@ from harvesters import (
|
|
|
16
16
|
Repository_Harvester,
|
|
17
17
|
Organization_Harvester,
|
|
18
18
|
Article_Harvester,
|
|
19
|
-
|
|
19
|
+
Software_Harvester,
|
|
20
20
|
Learningresource_Harvester,
|
|
21
21
|
Metadatastandard_Harvester,
|
|
22
22
|
Document_Harvester,
|
|
@@ -73,7 +73,7 @@ def get_harvester(
|
|
|
73
73
|
get_repo("links_repo"),
|
|
74
74
|
get_repo("themes_repo"),
|
|
75
75
|
),
|
|
76
|
-
"
|
|
76
|
+
"Software": lambda: Software_Harvester(
|
|
77
77
|
get_repo("persons_repo"),
|
|
78
78
|
get_repo("links_repo"),
|
|
79
79
|
),
|
|
@@ -11,6 +11,7 @@ log = logging.getLogger(__name__)
|
|
|
11
11
|
|
|
12
12
|
|
|
13
13
|
class Solr(object):
|
|
14
|
+
|
|
14
15
|
def __init__(
|
|
15
16
|
self,
|
|
16
17
|
solr_url: Optional[str] = None,
|
|
@@ -19,42 +20,29 @@ class Solr(object):
|
|
|
19
20
|
always_commit: bool = False,
|
|
20
21
|
timeout: int = 5 * 60,
|
|
21
22
|
) -> None:
|
|
22
|
-
self.solr_url = solr_url
|
|
23
|
-
self.solr_core = solr_core
|
|
24
|
-
self.auth = solr_auth
|
|
23
|
+
self.solr_url = solr_url if solr_url else config["solr_url"]
|
|
24
|
+
self.solr_core = solr_core if solr_url else config["solr_url"]
|
|
25
|
+
self.auth = solr_auth if solr_url else config["solr_url"]
|
|
25
26
|
self.client = SolrClient(
|
|
26
27
|
self.endpoint,
|
|
27
28
|
auth=self.authentication,
|
|
28
29
|
always_commit=always_commit,
|
|
29
30
|
timeout=timeout,
|
|
30
31
|
)
|
|
32
|
+
# test connection to solr endpoint
|
|
33
|
+
# -> raises exception if connection fails
|
|
34
|
+
self.client.ping()
|
|
31
35
|
|
|
32
36
|
@property
|
|
33
37
|
def endpoint(self):
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
solr_url = self.solr_url if self.solr_url else config["solr_url"]
|
|
38
|
-
log.debug(f"configured solr url: {solr_url}")
|
|
39
|
-
if solr_url.startswith("http://"):
|
|
40
|
-
raise ValueError(
|
|
41
|
-
"Insecure solr url configured. "
|
|
42
|
-
"Please check your configuration and use https."
|
|
43
|
-
)
|
|
44
|
-
solr_core = self.solr_core if self.solr_core else config["solr_core"]
|
|
45
|
-
log.debug(f"configured solr core: {solr_core}")
|
|
46
|
-
_endpoint = urljoin(solr_url, solr_core)
|
|
47
|
-
log.info(f"initialized solr client with endpoint: {_endpoint}")
|
|
48
|
-
return _endpoint
|
|
38
|
+
endpoint = urljoin(self.solr_url, self.solr_core)
|
|
39
|
+
log.info(f"initialized solr client with endpoint: {endpoint}")
|
|
40
|
+
return endpoint
|
|
49
41
|
|
|
50
42
|
@property
|
|
51
43
|
def authentication(self):
|
|
52
|
-
if self.auth
|
|
53
|
-
username, password = (
|
|
54
|
-
self.auth.split(":")
|
|
55
|
-
if self.auth
|
|
56
|
-
else config["solr_auth"].split(":")
|
|
57
|
-
)
|
|
44
|
+
if self.auth:
|
|
45
|
+
username, password = self.auth.split(":")
|
|
58
46
|
return HTTPBasicAuth(username, password)
|
|
59
47
|
|
|
60
48
|
def index_documents(
|
|
@@ -69,7 +57,8 @@ class Solr(object):
|
|
|
69
57
|
if len(documents) <= offset + batch_size:
|
|
70
58
|
batch = documents[offset:]
|
|
71
59
|
else:
|
|
72
|
-
|
|
60
|
+
limit = offset + batch_size
|
|
61
|
+
batch = documents[offset:limit]
|
|
73
62
|
|
|
74
63
|
if len(batch) == 0:
|
|
75
64
|
break
|
|
@@ -1,224 +0,0 @@
|
|
|
1
|
-
import logging
|
|
2
|
-
from .harvester_base import Harvester
|
|
3
|
-
from utils import sparql
|
|
4
|
-
from data_repositories.repository_n4eorganization import RepositoryN4EOrganization
|
|
5
|
-
|
|
6
|
-
log = logging.getLogger(__name__)
|
|
7
|
-
|
|
8
|
-
#harvester for Services https://nfdi4earth.pages.rwth-aachen.de/knowledgehub/nfdi4earth-kh-schema/Service/
|
|
9
|
-
class Service_Harvester(Harvester):
|
|
10
|
-
sparql_query = """
|
|
11
|
-
PREFIX fo: <http://www.w3.org/1999/XSL/Format#>
|
|
12
|
-
PREFIX foaf: <http://xmlns.com/foaf/0.1/>
|
|
13
|
-
PREFIX dc: <http://purl.org/dc/elements/1.1/>
|
|
14
|
-
PREFIX vcard: <http://www.w3.org/2006/vcard/ns#>
|
|
15
|
-
prefix dcat: <http://www.w3.org/ns/dcat#>
|
|
16
|
-
prefix dct: <http://purl.org/dc/terms/>
|
|
17
|
-
prefix n4e: <http://nfdi4earth.de/ontology/>
|
|
18
|
-
prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
|
|
19
|
-
|
|
20
|
-
SELECT ?subject ?predicate ?object ?contactpoint_email ?contactpoint_url
|
|
21
|
-
{
|
|
22
|
-
{
|
|
23
|
-
?subject rdf:type <http://schema.org/Service>.
|
|
24
|
-
?subject ?predicate ?object
|
|
25
|
-
FILTER (?predicate NOT IN (dcat:contactPoint, n4e:sourceSystem))
|
|
26
|
-
}
|
|
27
|
-
UNION {
|
|
28
|
-
VALUES ?predicate { dcat:contactPoint }
|
|
29
|
-
?subject rdf:type <http://schema.org/Service>;
|
|
30
|
-
dcat:contactPoint ?object.
|
|
31
|
-
optional { ?object vcard:hasEmail ?contactpoint_email. }
|
|
32
|
-
optional { ?object vcard:hasURL ?contactpoint_url. }
|
|
33
|
-
}
|
|
34
|
-
}
|
|
35
|
-
ORDER BY ?subject ?predicate
|
|
36
|
-
OFFSET %d
|
|
37
|
-
LIMIT %d
|
|
38
|
-
"""
|
|
39
|
-
|
|
40
|
-
def __init__(self, n4e_organizations_repo: RepositoryN4EOrganization, **kw):
|
|
41
|
-
super().__init__(**kw)
|
|
42
|
-
self.n4e_organizations_repo = n4e_organizations_repo
|
|
43
|
-
|
|
44
|
-
def harvest(self):
|
|
45
|
-
limit = 5000
|
|
46
|
-
# convert to list of repo documents for indexing
|
|
47
|
-
services = {} # repos dict
|
|
48
|
-
|
|
49
|
-
i = 0
|
|
50
|
-
hits = {}
|
|
51
|
-
# split sparql query
|
|
52
|
-
while True:
|
|
53
|
-
query_splitted = self.sparql_query % (limit * i, limit)
|
|
54
|
-
hits = sparql.execute_query(self.sparql_endpoint, query_splitted)
|
|
55
|
-
self.parse_response(hits, services)
|
|
56
|
-
|
|
57
|
-
i = i + 1
|
|
58
|
-
|
|
59
|
-
if len(hits) < limit:
|
|
60
|
-
break
|
|
61
|
-
|
|
62
|
-
services_list = []
|
|
63
|
-
for (
|
|
64
|
-
key,
|
|
65
|
-
value,
|
|
66
|
-
) in services.items(): # transform repos dict to list for indexing
|
|
67
|
-
service = value
|
|
68
|
-
if "mainTitle" not in service and len(service["name"]) > 0:
|
|
69
|
-
service["mainTitle"] = service["name"][0]
|
|
70
|
-
service["mainTitle"] = service["mainTitle"].strip()
|
|
71
|
-
services_list.append(service)
|
|
72
|
-
|
|
73
|
-
return services_list
|
|
74
|
-
|
|
75
|
-
def parse_response(
|
|
76
|
-
self, hits, services
|
|
77
|
-
):
|
|
78
|
-
for hit in hits:
|
|
79
|
-
subject = hit["subject"]["value"]
|
|
80
|
-
predicate = hit["predicate"]["value"]
|
|
81
|
-
object = hit["object"]["value"]
|
|
82
|
-
|
|
83
|
-
if subject not in services:
|
|
84
|
-
services[subject] = {}
|
|
85
|
-
services[subject]["uri"] = subject
|
|
86
|
-
services[subject]["id"] = self.getID(
|
|
87
|
-
subject
|
|
88
|
-
) # use ID from triple store also in Solr to ensure stable IDs
|
|
89
|
-
|
|
90
|
-
if predicate == "http://schema.org/name": # name
|
|
91
|
-
if (
|
|
92
|
-
"xml:lang" not in hit["object"]
|
|
93
|
-
or hit["object"]["xml:lang"] == "en"
|
|
94
|
-
): # use international name for orga name
|
|
95
|
-
self.addValue(
|
|
96
|
-
dict=services[subject],
|
|
97
|
-
attribute="name",
|
|
98
|
-
value=object,
|
|
99
|
-
)
|
|
100
|
-
services[subject]["mainTitle"] = object # mainTitle
|
|
101
|
-
if (
|
|
102
|
-
"name" not in services[subject]
|
|
103
|
-
or object not in services[subject]["name"]
|
|
104
|
-
): # prevent duplicates
|
|
105
|
-
self.addValue(
|
|
106
|
-
dict=services[subject],
|
|
107
|
-
attribute="name",
|
|
108
|
-
value=object,
|
|
109
|
-
)
|
|
110
|
-
elif predicate == "http://schema.org/description": # description
|
|
111
|
-
self.addValue(
|
|
112
|
-
dict=services[subject],
|
|
113
|
-
attribute="description",
|
|
114
|
-
value=object,
|
|
115
|
-
)
|
|
116
|
-
elif predicate == "http://schema.org/additionalType": # additionalType
|
|
117
|
-
self.addValue(
|
|
118
|
-
dict=services[subject],
|
|
119
|
-
attribute="additionalType",
|
|
120
|
-
value=object,
|
|
121
|
-
)
|
|
122
|
-
elif predicate == "http://schema.org/keywords": # keyword
|
|
123
|
-
self.addValue(
|
|
124
|
-
dict=services[subject],
|
|
125
|
-
attribute="keyword",
|
|
126
|
-
value=object,
|
|
127
|
-
)
|
|
128
|
-
elif predicate == "http://schema.org/url": # url
|
|
129
|
-
self.addValue(
|
|
130
|
-
dict=services[subject],
|
|
131
|
-
attribute="url",
|
|
132
|
-
value=object,
|
|
133
|
-
)
|
|
134
|
-
elif predicate == "http://nfdi4earth.de/ontology/serviceType": # serviceType
|
|
135
|
-
services[subject]["serviceType"] = object
|
|
136
|
-
elif predicate == "http://nfdi4earth.de/ontology/serviceHost": # serviceHost
|
|
137
|
-
services[subject]["serviceHost"] = object
|
|
138
|
-
elif predicate == "http://www.w3.org/ns/dcat#contactPoint": # contactPoint
|
|
139
|
-
if "contactpoint_email" in hit:
|
|
140
|
-
self.addValue(
|
|
141
|
-
services[subject],
|
|
142
|
-
"contactPoint" + self.flatten_separator + "email",
|
|
143
|
-
hit["contactpoint_email"]["value"],
|
|
144
|
-
)
|
|
145
|
-
if "contactpoint_url" in hit:
|
|
146
|
-
self.addValue(
|
|
147
|
-
services[subject],
|
|
148
|
-
"contactPoint" + self.flatten_separator + "url",
|
|
149
|
-
hit["contactpoint_url"]["value"],
|
|
150
|
-
)
|
|
151
|
-
elif (
|
|
152
|
-
predicate == "http://nfdi4earth.de/ontology/sourceSystemID"
|
|
153
|
-
): # sourceSystemID
|
|
154
|
-
self.addValue(
|
|
155
|
-
dict=services[subject],
|
|
156
|
-
attribute="sourceSystem" + self.flatten_separator + "id",
|
|
157
|
-
value=object,
|
|
158
|
-
)
|
|
159
|
-
elif (
|
|
160
|
-
predicate == "http://nfdi4earth.de/ontology/sourceSystem"
|
|
161
|
-
): # sourceSystem
|
|
162
|
-
if "sourceSystem_homepage" in hit:
|
|
163
|
-
self.addValue(
|
|
164
|
-
services[subject],
|
|
165
|
-
"sourceSystem" + self.flatten_separator + "homepage",
|
|
166
|
-
hit["sourceSystem_homepage"]["value"],
|
|
167
|
-
)
|
|
168
|
-
if "sourceSystem_title" in hit:
|
|
169
|
-
self.addValue(
|
|
170
|
-
services[subject],
|
|
171
|
-
"sourceSystem" + self.flatten_separator + "title",
|
|
172
|
-
hit["sourceSystem_title"]["value"],
|
|
173
|
-
)
|
|
174
|
-
elif (
|
|
175
|
-
predicate == "http://nfdi4earth.de/ontology/sourceSystemURL"
|
|
176
|
-
): # sourceSystemURL
|
|
177
|
-
self.addValue(
|
|
178
|
-
dict=services[subject],
|
|
179
|
-
attribute="sourceSystemURL",
|
|
180
|
-
value=object,
|
|
181
|
-
)
|
|
182
|
-
elif (
|
|
183
|
-
predicate == "http://www.w3.org/2004/02/skos/core#altLabel"
|
|
184
|
-
): # altLabel
|
|
185
|
-
self.addValue(
|
|
186
|
-
dict=services[subject],
|
|
187
|
-
attribute="altLabel",
|
|
188
|
-
value=object,
|
|
189
|
-
)
|
|
190
|
-
elif (
|
|
191
|
-
predicate == "http://www.w3.org/1999/02/22-rdf-syntax-ns#type"
|
|
192
|
-
): # type
|
|
193
|
-
self.addValue(
|
|
194
|
-
dict=services[subject],
|
|
195
|
-
attribute="type",
|
|
196
|
-
value=object
|
|
197
|
-
)
|
|
198
|
-
elif (
|
|
199
|
-
predicate == "http://nfdi4earth.de/ontology/serviceType"
|
|
200
|
-
): # serviceType
|
|
201
|
-
self.addValue(
|
|
202
|
-
dict=services[subject],
|
|
203
|
-
attribute="serviceType",
|
|
204
|
-
value=object
|
|
205
|
-
)
|
|
206
|
-
elif (
|
|
207
|
-
predicate == "http://nfdi4earth.de/ontology/serviceCategory"
|
|
208
|
-
): # serviceCategory
|
|
209
|
-
self.addValue(
|
|
210
|
-
dict=services[subject],
|
|
211
|
-
attribute="serviceCategory",
|
|
212
|
-
value=object
|
|
213
|
-
)
|
|
214
|
-
elif (
|
|
215
|
-
predicate == "http://nfdi4earth.de/ontology/idHostingInstitution"
|
|
216
|
-
): #idHostingInstitution #hostingInstitution_name #isN4EOperated
|
|
217
|
-
host_rorID = object
|
|
218
|
-
services[subject]["idHostingInstitution"] = host_rorID #idHostingInstitution
|
|
219
|
-
|
|
220
|
-
n4e_organization = self.n4e_organizations_repo.get_n4e_organization_by_rorID(host_rorID) #check if rorID belongs to a organization that is n4e member
|
|
221
|
-
if n4e_organization is not None:
|
|
222
|
-
services[subject]["isN4EOperated"] = True #isN4EOperated
|
|
223
|
-
services[subject]["hostingInstitution_name"] = n4e_organization["name"] #hostingInstitution_name currently only for n4e operated services
|
|
224
|
-
#only index if `True`, do not index if `False`
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{onestop4all_indexer-2.8.0.dev12 → onestop4all_indexer-2.8.0.dev14}/data_repositories/__init__.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{onestop4all_indexer-2.8.0.dev12 → onestop4all_indexer-2.8.0.dev14}/harvesters/harvester_article.py
RENAMED
|
File without changes
|
{onestop4all_indexer-2.8.0.dev12 → onestop4all_indexer-2.8.0.dev14}/harvesters/harvester_base.py
RENAMED
|
File without changes
|
|
File without changes
|
{onestop4all_indexer-2.8.0.dev12 → onestop4all_indexer-2.8.0.dev14}/harvesters/harvester_dataset.py
RENAMED
|
File without changes
|
{onestop4all_indexer-2.8.0.dev12 → onestop4all_indexer-2.8.0.dev14}/harvesters/harvester_document.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|