OneStop4All-Indexer 2.8.0.dev11__tar.gz → 2.8.0.dev13__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {onestop4all_indexer-2.8.0.dev11 → onestop4all_indexer-2.8.0.dev13/OneStop4All_Indexer.egg-info}/PKG-INFO +1 -1
- {onestop4all_indexer-2.8.0.dev11 → onestop4all_indexer-2.8.0.dev13}/OneStop4All_Indexer.egg-info/SOURCES.txt +1 -1
- {onestop4all_indexer-2.8.0.dev11/OneStop4All_Indexer.egg-info → onestop4all_indexer-2.8.0.dev13}/PKG-INFO +1 -1
- {onestop4all_indexer-2.8.0.dev11 → onestop4all_indexer-2.8.0.dev13}/harvesters/__init__.py +1 -1
- {onestop4all_indexer-2.8.0.dev11 → onestop4all_indexer-2.8.0.dev13}/harvesters/harvester_organization.py +91 -125
- onestop4all_indexer-2.8.0.dev13/harvesters/harvester_service.py +551 -0
- onestop4all_indexer-2.8.0.dev11/harvesters/harvester_softwaresourcecode.py → onestop4all_indexer-2.8.0.dev13/harvesters/harvester_software.py +5 -5
- {onestop4all_indexer-2.8.0.dev11 → onestop4all_indexer-2.8.0.dev13}/setup.py +1 -1
- {onestop4all_indexer-2.8.0.dev11 → onestop4all_indexer-2.8.0.dev13}/utils/cli.py +1 -1
- {onestop4all_indexer-2.8.0.dev11 → onestop4all_indexer-2.8.0.dev13}/utils/harvest.py +2 -2
- {onestop4all_indexer-2.8.0.dev11 → onestop4all_indexer-2.8.0.dev13}/utils/solr.py +14 -25
- onestop4all_indexer-2.8.0.dev11/harvesters/harvester_service.py +0 -224
- {onestop4all_indexer-2.8.0.dev11 → onestop4all_indexer-2.8.0.dev13}/LICENSE +0 -0
- {onestop4all_indexer-2.8.0.dev11 → onestop4all_indexer-2.8.0.dev13}/OneStop4All_Indexer.egg-info/dependency_links.txt +0 -0
- {onestop4all_indexer-2.8.0.dev11 → onestop4all_indexer-2.8.0.dev13}/OneStop4All_Indexer.egg-info/entry_points.txt +0 -0
- {onestop4all_indexer-2.8.0.dev11 → onestop4all_indexer-2.8.0.dev13}/OneStop4All_Indexer.egg-info/requires.txt +0 -0
- {onestop4all_indexer-2.8.0.dev11 → onestop4all_indexer-2.8.0.dev13}/OneStop4All_Indexer.egg-info/top_level.txt +0 -0
- {onestop4all_indexer-2.8.0.dev11 → onestop4all_indexer-2.8.0.dev13}/data_repositories/__init__.py +0 -0
- {onestop4all_indexer-2.8.0.dev11 → onestop4all_indexer-2.8.0.dev13}/data_repositories/repository_base.py +0 -0
- {onestop4all_indexer-2.8.0.dev11 → onestop4all_indexer-2.8.0.dev13}/data_repositories/repository_n4eorganization.py +0 -0
- {onestop4all_indexer-2.8.0.dev11 → onestop4all_indexer-2.8.0.dev13}/data_repositories/repository_person.py +0 -0
- {onestop4all_indexer-2.8.0.dev11 → onestop4all_indexer-2.8.0.dev13}/data_repositories/repository_resource_links.py +0 -0
- {onestop4all_indexer-2.8.0.dev11 → onestop4all_indexer-2.8.0.dev13}/data_repositories/repository_theme.py +0 -0
- {onestop4all_indexer-2.8.0.dev11 → onestop4all_indexer-2.8.0.dev13}/harvesters/harvester_article.py +0 -0
- {onestop4all_indexer-2.8.0.dev11 → onestop4all_indexer-2.8.0.dev13}/harvesters/harvester_base.py +0 -0
- {onestop4all_indexer-2.8.0.dev11 → onestop4all_indexer-2.8.0.dev13}/harvesters/harvester_dataservice.py +0 -0
- {onestop4all_indexer-2.8.0.dev11 → onestop4all_indexer-2.8.0.dev13}/harvesters/harvester_dataset.py +0 -0
- {onestop4all_indexer-2.8.0.dev11 → onestop4all_indexer-2.8.0.dev13}/harvesters/harvester_document.py +0 -0
- {onestop4all_indexer-2.8.0.dev11 → onestop4all_indexer-2.8.0.dev13}/harvesters/harvester_learningresource.py +0 -0
- {onestop4all_indexer-2.8.0.dev11 → onestop4all_indexer-2.8.0.dev13}/harvesters/harvester_metadatastandards.py +0 -0
- {onestop4all_indexer-2.8.0.dev11 → onestop4all_indexer-2.8.0.dev13}/harvesters/harvester_repository.py +0 -0
- {onestop4all_indexer-2.8.0.dev11 → onestop4all_indexer-2.8.0.dev13}/pyproject.toml +0 -0
- {onestop4all_indexer-2.8.0.dev11 → onestop4all_indexer-2.8.0.dev13}/setup.cfg +0 -0
- {onestop4all_indexer-2.8.0.dev11 → onestop4all_indexer-2.8.0.dev13}/utils/__init__.py +0 -0
- {onestop4all_indexer-2.8.0.dev11 → onestop4all_indexer-2.8.0.dev13}/utils/configs.py +0 -0
- {onestop4all_indexer-2.8.0.dev11 → onestop4all_indexer-2.8.0.dev13}/utils/sparql.py +0 -0
- {onestop4all_indexer-2.8.0.dev11 → onestop4all_indexer-2.8.0.dev13}/utils/util.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: OneStop4All-Indexer
|
|
3
|
-
Version: 2.8.0.dev11
|
|
3
|
+
Version: 2.8.0.dev13
|
|
4
4
|
Summary: Library to harvest data from NFDI4Earth-KnowledgeHub to OneStop4All-Index
|
|
5
5
|
Author: Markus Konkol, Arne Vogt, Tom Niers, Ralf Klammer
|
|
6
6
|
Author-email: m.konkol@52north.org, a.vogt@52north.org, tom.niers@tu-dresden.de, ralf.klammer@tu-dresden.de
|
|
@@ -24,7 +24,7 @@ harvesters/harvester_metadatastandards.py
|
|
|
24
24
|
harvesters/harvester_organization.py
|
|
25
25
|
harvesters/harvester_repository.py
|
|
26
26
|
harvesters/harvester_service.py
|
|
27
|
-
harvesters/harvester_softwaresourcecode.py
|
|
27
|
+
harvesters/harvester_software.py
|
|
28
28
|
utils/__init__.py
|
|
29
29
|
utils/cli.py
|
|
30
30
|
utils/configs.py
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: OneStop4All-Indexer
|
|
3
|
-
Version: 2.8.0.dev11
|
|
3
|
+
Version: 2.8.0.dev13
|
|
4
4
|
Summary: Library to harvest data from NFDI4Earth-KnowledgeHub to OneStop4All-Index
|
|
5
5
|
Author: Markus Konkol, Arne Vogt, Tom Niers, Ralf Klammer
|
|
6
6
|
Author-email: m.konkol@52north.org, a.vogt@52north.org, tom.niers@tu-dresden.de, ralf.klammer@tu-dresden.de
|
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
from .harvester_repository import *
|
|
2
2
|
from .harvester_organization import *
|
|
3
3
|
from .harvester_article import *
|
|
4
|
-
from .harvester_softwaresourcecode import *
|
|
4
|
+
from .harvester_software import *
|
|
5
5
|
from .harvester_learningresource import *
|
|
6
6
|
from .harvester_metadatastandards import *
|
|
7
7
|
from .harvester_document import *
|
|
@@ -1,10 +1,10 @@
|
|
|
1
|
-
from .harvester_base import Harvester
|
|
2
|
-
from data_repositories.repository_person import RepositoryPerson
|
|
3
|
-
from utils import sparql
|
|
4
|
-
from utils import flatten_dict
|
|
5
|
-
from utils import is_truthy
|
|
6
1
|
import logging
|
|
7
2
|
|
|
3
|
+
from data_repositories.repository_person import RepositoryPerson
|
|
4
|
+
from utils import sparql, flatten_dict, is_truthy
|
|
5
|
+
|
|
6
|
+
from .harvester_base import Harvester
|
|
7
|
+
|
|
8
8
|
log = logging.getLogger(__name__)
|
|
9
9
|
|
|
10
10
|
|
|
@@ -20,32 +20,37 @@ class Organization_Harvester(Harvester):
|
|
|
20
20
|
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
|
|
21
21
|
PREFIX org: <http://www.w3.org/ns/org#>
|
|
22
22
|
PREFIX schema: <http://schema.org/>
|
|
23
|
+
PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>
|
|
23
24
|
|
|
24
25
|
SELECT ?subject ?predicate ?object ?geo_as_wkt ?isN4EMember
|
|
25
26
|
WHERE {
|
|
26
|
-
|
|
27
|
+
{
|
|
28
|
+
SELECT DISTINCT ?subject
|
|
29
|
+
WHERE {
|
|
30
|
+
?subject rdf:type foaf:Organization .
|
|
31
|
+
BIND(STRAFTER(STR(?subject), "/objects/") AS ?id)
|
|
32
|
+
FILTER(STRSTARTS(?id, "n4e/"))
|
|
33
|
+
FILTER(!CONTAINS(?id, "jsonPointer"))
|
|
34
|
+
FILTER EXISTS { ?subject n4e:sourceSystem ?source }
|
|
35
|
+
}
|
|
36
|
+
OFFSET %s
|
|
37
|
+
LIMIT %s
|
|
38
|
+
}
|
|
39
|
+
|
|
27
40
|
?subject ?predicate ?object .
|
|
28
|
-
|
|
29
|
-
FILTER EXISTS { ?subject n4e:sourceSystem ?source }
|
|
41
|
+
|
|
30
42
|
OPTIONAL {
|
|
31
43
|
?subject geo:hasGeometry ?geometry .
|
|
32
44
|
?geometry geo:asWKT ?geo_as_wkt .
|
|
33
45
|
}
|
|
46
|
+
|
|
34
47
|
OPTIONAL {
|
|
35
|
-
?subject
|
|
36
|
-
OPTIONAL { ?source_system dct:title ?sourceSystem_title . }
|
|
37
|
-
OPTIONAL { ?source_system foaf:homepage ?sourceSystem_homepage . }
|
|
38
|
-
}
|
|
39
|
-
OPTIONAL {
|
|
40
|
-
?subject rdf:type foaf:Organization ;
|
|
41
|
-
org:hasMembership ?membership .
|
|
48
|
+
?subject org:hasMembership ?membership .
|
|
42
49
|
?membership org:organization ?n4eproject .
|
|
43
50
|
?n4eproject schema:url "https://nfdi4earth.de/"^^xsd:anyURI .
|
|
44
51
|
BIND(true AS ?isN4EMember)
|
|
45
52
|
}
|
|
46
53
|
}
|
|
47
|
-
OFFSET %d
|
|
48
|
-
LIMIT %d
|
|
49
54
|
"""
|
|
50
55
|
|
|
51
56
|
def __init__(self, persons_repo: RepositoryPerson, **kw):
|
|
@@ -83,11 +88,34 @@ class Organization_Harvester(Harvester):
|
|
|
83
88
|
)
|
|
84
89
|
|
|
85
90
|
organization_list = []
|
|
86
|
-
for (
|
|
87
|
-
key,
|
|
88
|
-
value,
|
|
89
|
-
) in organizations.items(): # transform orga dict to list for indexing
|
|
91
|
+
for key, value in organizations.items(): # transform orga dict to list for indexing
|
|
90
92
|
organization = value
|
|
93
|
+
|
|
94
|
+
# clean geometry: if more than one geometry is present,
|
|
95
|
+
# keep only the first one (should not happen, but just in case)
|
|
96
|
+
geom = organization.get("geometry", [])
|
|
97
|
+
if len(geom) > 1:
|
|
98
|
+
organization["geometry"] = geom[:1]
|
|
99
|
+
|
|
100
|
+
# clean rorId: if more than one rorId is present,
|
|
101
|
+
# keep only unique ones (should not happen, but just in case)
|
|
102
|
+
rorid = organization.get("rorId", [])
|
|
103
|
+
if len(rorid) > 1:
|
|
104
|
+
organization["rorId"] = list(set(rorid))
|
|
105
|
+
|
|
106
|
+
# Transform locations
|
|
107
|
+
localities = organization.get("locality", [])
|
|
108
|
+
countries = organization.get("countryName", [])
|
|
109
|
+
complete_locations = []
|
|
110
|
+
|
|
111
|
+
for loc, country in zip(localities, countries):
|
|
112
|
+
complete_locations.append(f"{loc}, {country}")
|
|
113
|
+
|
|
114
|
+
# Filter duplication
|
|
115
|
+
organization["location"] = list(dict.fromkeys(complete_locations))
|
|
116
|
+
organization["locality"] = list(dict.fromkeys(localities))
|
|
117
|
+
organization["countryName"] = list(dict.fromkeys(countries))
|
|
118
|
+
|
|
91
119
|
# ensure mainTitle
|
|
92
120
|
if (
|
|
93
121
|
"mainTitle" not in organization
|
|
@@ -132,9 +160,23 @@ class Organization_Harvester(Harvester):
|
|
|
132
160
|
assignto_dict[subject[0]][attribute] = []
|
|
133
161
|
assignto_dict[subject[0]][attribute].extend(organization_name)
|
|
134
162
|
|
|
163
|
+
def does_object_exist(self, value, attribute: str, data: dict):
|
|
164
|
+
return attribute in data and value in data[attribute]
|
|
165
|
+
|
|
135
166
|
def parse_response(
|
|
136
167
|
self, hits, organizations, issuborganization, hasN4Econtact
|
|
137
168
|
):
|
|
169
|
+
PREDICATES = {
|
|
170
|
+
"http://xmlns.com/foaf/0.1/homepage": "homepage",
|
|
171
|
+
"http://www.w3.org/2002/07/owl#sameAs": "sameAs",
|
|
172
|
+
"http://w3id.org/nfdi4ing/metadata4ing#hasRorId": "rorId",
|
|
173
|
+
"http://www.w3.org/2004/02/skos/core#altLabel": "altLabel",
|
|
174
|
+
"http://www.w3.org/1999/02/22-rdf-syntax-ns#type": "type",
|
|
175
|
+
"http://nfdi4earth.de/ontology/sourceSystemURL": "sourceSystemURL",
|
|
176
|
+
"http://nfdi4earth.de/ontology/hasSignedCommitment": "hasSignedCommitment",
|
|
177
|
+
"http://nfdi4earth.de/ontology/sourceSystemID": "sourceSystem" + self.flatten_separator + "id"
|
|
178
|
+
}
|
|
179
|
+
|
|
138
180
|
for hit in hits:
|
|
139
181
|
subject = hit["subject"]["value"]
|
|
140
182
|
predicate = hit["predicate"]["value"]
|
|
@@ -146,17 +188,22 @@ class Organization_Harvester(Harvester):
|
|
|
146
188
|
organizations[subject]["id"] = self.getID(
|
|
147
189
|
subject
|
|
148
190
|
) # use ID from triple store also in Solr to ensure stable IDs
|
|
191
|
+
organizations[subject]["locality"] = []
|
|
192
|
+
organizations[subject]["countryName"] = []
|
|
149
193
|
|
|
150
194
|
# set geometry if available and not already set
|
|
151
195
|
if (
|
|
152
196
|
predicate == "http://www.opengis.net/ont/geosparql#hasGeometry"
|
|
153
|
-
and (
|
|
154
|
-
|
|
197
|
+
and (
|
|
198
|
+
"geometry" not in organizations[subject]
|
|
199
|
+
or hit["geo_as_wkt"]["value"]
|
|
200
|
+
not in organizations[subject]["geometry"]
|
|
201
|
+
)
|
|
155
202
|
):
|
|
156
203
|
self.addValue(
|
|
157
204
|
dict=organizations[subject],
|
|
158
205
|
attribute="geometry",
|
|
159
|
-
value=hit["geo_as_wkt"]["value"]
|
|
206
|
+
value=hit["geo_as_wkt"]["value"],
|
|
160
207
|
)
|
|
161
208
|
|
|
162
209
|
# set membership in N4E project
|
|
@@ -171,8 +218,9 @@ class Organization_Harvester(Harvester):
|
|
|
171
218
|
|
|
172
219
|
if predicate == "http://schema.org/name": # name
|
|
173
220
|
if (
|
|
174
|
-
"xml:lang" not in hit["object"]
|
|
175
|
-
or hit["object"]["xml:lang"] == "en"
|
|
221
|
+
("xml:lang" not in hit["object"]
|
|
222
|
+
or hit["object"]["xml:lang"] == "en")
|
|
223
|
+
and not self.does_object_exist(object, "name", organizations[subject])
|
|
176
224
|
): # use international name for orga name
|
|
177
225
|
self.addValue(
|
|
178
226
|
dict=organizations[subject],
|
|
@@ -181,60 +229,30 @@ class Organization_Harvester(Harvester):
|
|
|
181
229
|
)
|
|
182
230
|
organizations[subject]["mainTitle"] = object # mainTitle
|
|
183
231
|
if (
|
|
184
|
-
"name_alt"
|
|
185
|
-
or object not in organizations[subject]["name_alt"]
|
|
232
|
+
not self.does_object_exist(object, "name_alt", organizations[subject])
|
|
186
233
|
): # prevent duplicates
|
|
187
234
|
self.addValue(
|
|
188
235
|
dict=organizations[subject],
|
|
189
236
|
attribute="name_alt",
|
|
190
237
|
value=object,
|
|
191
238
|
)
|
|
192
|
-
|
|
193
|
-
|
|
194
|
-
|
|
195
|
-
|
|
196
|
-
|
|
197
|
-
)
|
|
198
|
-
|
|
199
|
-
|
|
200
|
-
|
|
201
|
-
self.
|
|
202
|
-
|
|
203
|
-
|
|
204
|
-
|
|
205
|
-
|
|
206
|
-
elif (
|
|
207
|
-
predicate == "http://www.w3.org/2006/vcard/ns#country-name"
|
|
208
|
-
): # countryName
|
|
209
|
-
self.addValue(
|
|
210
|
-
dict=organizations[subject],
|
|
211
|
-
attribute="countryName",
|
|
212
|
-
value=object,
|
|
213
|
-
)
|
|
214
|
-
elif predicate == "http://www.w3.org/2002/07/owl#sameAs": # sameAs
|
|
215
|
-
self.addValue(
|
|
216
|
-
dict=organizations[subject],
|
|
217
|
-
attribute="sameAs",
|
|
218
|
-
value=object,
|
|
219
|
-
)
|
|
220
|
-
elif (
|
|
221
|
-
predicate == "http://w3id.org/nfdi4ing/metadata4ing#hasRorId"
|
|
222
|
-
): # rorId
|
|
223
|
-
self.addValue(
|
|
224
|
-
dict=organizations[subject],
|
|
225
|
-
attribute="rorId",
|
|
226
|
-
value=object,
|
|
227
|
-
)
|
|
228
|
-
elif (
|
|
229
|
-
predicate == "http://www.w3.org/2004/02/skos/core#altLabel"
|
|
230
|
-
): # altLabel
|
|
231
|
-
self.addValue(
|
|
232
|
-
dict=organizations[subject],
|
|
233
|
-
attribute="altLabel",
|
|
234
|
-
value=object,
|
|
235
|
-
)
|
|
239
|
+
|
|
240
|
+
elif predicate == "http://www.w3.org/2006/vcard/ns#locality":
|
|
241
|
+
organizations[subject]["locality"].append(object)
|
|
242
|
+
|
|
243
|
+
elif predicate == "http://www.w3.org/2006/vcard/ns#country-name":
|
|
244
|
+
organizations[subject]["countryName"].append(object)
|
|
245
|
+
|
|
246
|
+
elif predicate in PREDICATES:
|
|
247
|
+
attribute = PREDICATES[predicate]
|
|
248
|
+
if not self.does_object_exist(object, attribute, organizations[subject]):
|
|
249
|
+
self.addValue(
|
|
250
|
+
dict=organizations[subject],
|
|
251
|
+
attribute=attribute,
|
|
252
|
+
value=object)
|
|
236
253
|
elif (
|
|
237
254
|
predicate == "http://www.w3.org/ns/org#subOrganizationOf"
|
|
255
|
+
and (subject, object) not in issuborganization
|
|
238
256
|
): # subOrganizationOf
|
|
239
257
|
issuborganization.append(
|
|
240
258
|
(subject, object)
|
|
@@ -242,60 +260,8 @@ class Organization_Harvester(Harvester):
|
|
|
242
260
|
elif (
|
|
243
261
|
predicate
|
|
244
262
|
== "http://nfdi4earth.de/ontology/hasNFDI4EarthContactPerson"
|
|
263
|
+
and (subject, object) not in hasN4Econtact
|
|
245
264
|
): # NFDI4EarthContactPerson
|
|
246
265
|
hasN4Econtact.append(
|
|
247
266
|
(subject, object)
|
|
248
267
|
) # store, resolve contact info later
|
|
249
|
-
elif (
|
|
250
|
-
predicate == "http://www.w3.org/1999/02/22-rdf-syntax-ns#type"
|
|
251
|
-
): # type
|
|
252
|
-
self.addValue(
|
|
253
|
-
dict=organizations[subject], attribute="type", value=object
|
|
254
|
-
)
|
|
255
|
-
elif (
|
|
256
|
-
predicate == "http://nfdi4earth.de/ontology/sourceSystem"
|
|
257
|
-
): # sourceSystem
|
|
258
|
-
if "sourceSystem_homepage" in hit:
|
|
259
|
-
self.addValue(
|
|
260
|
-
organizations[subject],
|
|
261
|
-
"sourceSystem" + self.flatten_separator + "homepage",
|
|
262
|
-
hit["sourceSystem_homepage"]["value"],
|
|
263
|
-
)
|
|
264
|
-
if "sourceSystem_title" in hit:
|
|
265
|
-
self.addValue(
|
|
266
|
-
organizations[subject],
|
|
267
|
-
"sourceSystem" + self.flatten_separator + "title",
|
|
268
|
-
hit["sourceSystem_title"]["value"],
|
|
269
|
-
)
|
|
270
|
-
elif (
|
|
271
|
-
predicate == "http://nfdi4earth.de/ontology/sourceSystemID"
|
|
272
|
-
): # sourceSystemID
|
|
273
|
-
if (
|
|
274
|
-
"sourceSystem" + self.flatten_separator + "id"
|
|
275
|
-
not in organizations[subject]
|
|
276
|
-
):
|
|
277
|
-
# only set if not already present
|
|
278
|
-
self.addValue(
|
|
279
|
-
dict=organizations[subject],
|
|
280
|
-
attribute="sourceSystem"
|
|
281
|
-
+ self.flatten_separator
|
|
282
|
-
+ "id",
|
|
283
|
-
value=object,
|
|
284
|
-
)
|
|
285
|
-
elif (
|
|
286
|
-
predicate == "http://nfdi4earth.de/ontology/sourceSystemURL"
|
|
287
|
-
): # sourceSystemURL
|
|
288
|
-
self.addValue(
|
|
289
|
-
dict=organizations[subject],
|
|
290
|
-
attribute="sourceSystemURL",
|
|
291
|
-
value=object,
|
|
292
|
-
)
|
|
293
|
-
elif (
|
|
294
|
-
predicate
|
|
295
|
-
== "http://nfdi4earth.de/ontology/hasSignedCommitment"
|
|
296
|
-
): # hasSignedCommitment
|
|
297
|
-
self.addValue(
|
|
298
|
-
dict=organizations[subject],
|
|
299
|
-
attribute="hasSignedCommitment",
|
|
300
|
-
value=object,
|
|
301
|
-
)
|