OneStop4All-Indexer 2.8.0.dev11__tar.gz → 2.8.0.dev13__tar.gz

This diff shows the differences between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Files changed (37)
  1. {onestop4all_indexer-2.8.0.dev11 → onestop4all_indexer-2.8.0.dev13/OneStop4All_Indexer.egg-info}/PKG-INFO +1 -1
  2. {onestop4all_indexer-2.8.0.dev11 → onestop4all_indexer-2.8.0.dev13}/OneStop4All_Indexer.egg-info/SOURCES.txt +1 -1
  3. {onestop4all_indexer-2.8.0.dev11/OneStop4All_Indexer.egg-info → onestop4all_indexer-2.8.0.dev13}/PKG-INFO +1 -1
  4. {onestop4all_indexer-2.8.0.dev11 → onestop4all_indexer-2.8.0.dev13}/harvesters/__init__.py +1 -1
  5. {onestop4all_indexer-2.8.0.dev11 → onestop4all_indexer-2.8.0.dev13}/harvesters/harvester_organization.py +91 -125
  6. onestop4all_indexer-2.8.0.dev13/harvesters/harvester_service.py +551 -0
  7. onestop4all_indexer-2.8.0.dev11/harvesters/harvester_softwaresourcecode.py → onestop4all_indexer-2.8.0.dev13/harvesters/harvester_software.py +5 -5
  8. {onestop4all_indexer-2.8.0.dev11 → onestop4all_indexer-2.8.0.dev13}/setup.py +1 -1
  9. {onestop4all_indexer-2.8.0.dev11 → onestop4all_indexer-2.8.0.dev13}/utils/cli.py +1 -1
  10. {onestop4all_indexer-2.8.0.dev11 → onestop4all_indexer-2.8.0.dev13}/utils/harvest.py +2 -2
  11. {onestop4all_indexer-2.8.0.dev11 → onestop4all_indexer-2.8.0.dev13}/utils/solr.py +14 -25
  12. onestop4all_indexer-2.8.0.dev11/harvesters/harvester_service.py +0 -224
  13. {onestop4all_indexer-2.8.0.dev11 → onestop4all_indexer-2.8.0.dev13}/LICENSE +0 -0
  14. {onestop4all_indexer-2.8.0.dev11 → onestop4all_indexer-2.8.0.dev13}/OneStop4All_Indexer.egg-info/dependency_links.txt +0 -0
  15. {onestop4all_indexer-2.8.0.dev11 → onestop4all_indexer-2.8.0.dev13}/OneStop4All_Indexer.egg-info/entry_points.txt +0 -0
  16. {onestop4all_indexer-2.8.0.dev11 → onestop4all_indexer-2.8.0.dev13}/OneStop4All_Indexer.egg-info/requires.txt +0 -0
  17. {onestop4all_indexer-2.8.0.dev11 → onestop4all_indexer-2.8.0.dev13}/OneStop4All_Indexer.egg-info/top_level.txt +0 -0
  18. {onestop4all_indexer-2.8.0.dev11 → onestop4all_indexer-2.8.0.dev13}/data_repositories/__init__.py +0 -0
  19. {onestop4all_indexer-2.8.0.dev11 → onestop4all_indexer-2.8.0.dev13}/data_repositories/repository_base.py +0 -0
  20. {onestop4all_indexer-2.8.0.dev11 → onestop4all_indexer-2.8.0.dev13}/data_repositories/repository_n4eorganization.py +0 -0
  21. {onestop4all_indexer-2.8.0.dev11 → onestop4all_indexer-2.8.0.dev13}/data_repositories/repository_person.py +0 -0
  22. {onestop4all_indexer-2.8.0.dev11 → onestop4all_indexer-2.8.0.dev13}/data_repositories/repository_resource_links.py +0 -0
  23. {onestop4all_indexer-2.8.0.dev11 → onestop4all_indexer-2.8.0.dev13}/data_repositories/repository_theme.py +0 -0
  24. {onestop4all_indexer-2.8.0.dev11 → onestop4all_indexer-2.8.0.dev13}/harvesters/harvester_article.py +0 -0
  25. {onestop4all_indexer-2.8.0.dev11 → onestop4all_indexer-2.8.0.dev13}/harvesters/harvester_base.py +0 -0
  26. {onestop4all_indexer-2.8.0.dev11 → onestop4all_indexer-2.8.0.dev13}/harvesters/harvester_dataservice.py +0 -0
  27. {onestop4all_indexer-2.8.0.dev11 → onestop4all_indexer-2.8.0.dev13}/harvesters/harvester_dataset.py +0 -0
  28. {onestop4all_indexer-2.8.0.dev11 → onestop4all_indexer-2.8.0.dev13}/harvesters/harvester_document.py +0 -0
  29. {onestop4all_indexer-2.8.0.dev11 → onestop4all_indexer-2.8.0.dev13}/harvesters/harvester_learningresource.py +0 -0
  30. {onestop4all_indexer-2.8.0.dev11 → onestop4all_indexer-2.8.0.dev13}/harvesters/harvester_metadatastandards.py +0 -0
  31. {onestop4all_indexer-2.8.0.dev11 → onestop4all_indexer-2.8.0.dev13}/harvesters/harvester_repository.py +0 -0
  32. {onestop4all_indexer-2.8.0.dev11 → onestop4all_indexer-2.8.0.dev13}/pyproject.toml +0 -0
  33. {onestop4all_indexer-2.8.0.dev11 → onestop4all_indexer-2.8.0.dev13}/setup.cfg +0 -0
  34. {onestop4all_indexer-2.8.0.dev11 → onestop4all_indexer-2.8.0.dev13}/utils/__init__.py +0 -0
  35. {onestop4all_indexer-2.8.0.dev11 → onestop4all_indexer-2.8.0.dev13}/utils/configs.py +0 -0
  36. {onestop4all_indexer-2.8.0.dev11 → onestop4all_indexer-2.8.0.dev13}/utils/sparql.py +0 -0
  37. {onestop4all_indexer-2.8.0.dev11 → onestop4all_indexer-2.8.0.dev13}/utils/util.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: OneStop4All-Indexer
3
- Version: 2.8.0.dev11
3
+ Version: 2.8.0.dev13
4
4
  Summary: Library to harvest data from NFDI4Earth-KnowledgeHub to OneStop4All-Index
5
5
  Author: Markus Konkol, Arne Vogt, Tom Niers, Ralf Klammer
6
6
  Author-email: m.konkol@52north.org, a.vogt@52north.org, tom.niers@tu-dresden.de, ralf.klammer@tu-dresden.de
@@ -24,7 +24,7 @@ harvesters/harvester_metadatastandards.py
24
24
  harvesters/harvester_organization.py
25
25
  harvesters/harvester_repository.py
26
26
  harvesters/harvester_service.py
27
- harvesters/harvester_softwaresourcecode.py
27
+ harvesters/harvester_software.py
28
28
  utils/__init__.py
29
29
  utils/cli.py
30
30
  utils/configs.py
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: OneStop4All-Indexer
3
- Version: 2.8.0.dev11
3
+ Version: 2.8.0.dev13
4
4
  Summary: Library to harvest data from NFDI4Earth-KnowledgeHub to OneStop4All-Index
5
5
  Author: Markus Konkol, Arne Vogt, Tom Niers, Ralf Klammer
6
6
  Author-email: m.konkol@52north.org, a.vogt@52north.org, tom.niers@tu-dresden.de, ralf.klammer@tu-dresden.de
@@ -1,7 +1,7 @@
1
1
  from .harvester_repository import *
2
2
  from .harvester_organization import *
3
3
  from .harvester_article import *
4
- from .harvester_softwaresourcecode import *
4
+ from .harvester_software import *
5
5
  from .harvester_learningresource import *
6
6
  from .harvester_metadatastandards import *
7
7
  from .harvester_document import *
@@ -1,10 +1,10 @@
1
- from .harvester_base import Harvester
2
- from data_repositories.repository_person import RepositoryPerson
3
- from utils import sparql
4
- from utils import flatten_dict
5
- from utils import is_truthy
6
1
  import logging
7
2
 
3
+ from data_repositories.repository_person import RepositoryPerson
4
+ from utils import sparql, flatten_dict, is_truthy
5
+
6
+ from .harvester_base import Harvester
7
+
8
8
  log = logging.getLogger(__name__)
9
9
 
10
10
 
@@ -20,32 +20,37 @@ class Organization_Harvester(Harvester):
20
20
  PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
21
21
  PREFIX org: <http://www.w3.org/ns/org#>
22
22
  PREFIX schema: <http://schema.org/>
23
+ PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>
23
24
 
24
25
  SELECT ?subject ?predicate ?object ?geo_as_wkt ?isN4EMember
25
26
  WHERE {
26
- ?subject rdf:type foaf:Organization .
27
+ {
28
+ SELECT DISTINCT ?subject
29
+ WHERE {
30
+ ?subject rdf:type foaf:Organization .
31
+ BIND(STRAFTER(STR(?subject), "/objects/") AS ?id)
32
+ FILTER(STRSTARTS(?id, "n4e/"))
33
+ FILTER(!CONTAINS(?id, "jsonPointer"))
34
+ FILTER EXISTS { ?subject n4e:sourceSystem ?source }
35
+ }
36
+ OFFSET %s
37
+ LIMIT %s
38
+ }
39
+
27
40
  ?subject ?predicate ?object .
28
- # Exclude messy data that was harvested from remote source, contains duplicates, etc
29
- FILTER EXISTS { ?subject n4e:sourceSystem ?source }
41
+
30
42
  OPTIONAL {
31
43
  ?subject geo:hasGeometry ?geometry .
32
44
  ?geometry geo:asWKT ?geo_as_wkt .
33
45
  }
46
+
34
47
  OPTIONAL {
35
- ?subject n4e:sourceSystem ?source_system .
36
- OPTIONAL { ?source_system dct:title ?sourceSystem_title . }
37
- OPTIONAL { ?source_system foaf:homepage ?sourceSystem_homepage . }
38
- }
39
- OPTIONAL {
40
- ?subject rdf:type foaf:Organization ;
41
- org:hasMembership ?membership .
48
+ ?subject org:hasMembership ?membership .
42
49
  ?membership org:organization ?n4eproject .
43
50
  ?n4eproject schema:url "https://nfdi4earth.de/"^^xsd:anyURI .
44
51
  BIND(true AS ?isN4EMember)
45
52
  }
46
53
  }
47
- OFFSET %d
48
- LIMIT %d
49
54
  """
50
55
 
51
56
  def __init__(self, persons_repo: RepositoryPerson, **kw):
@@ -83,11 +88,34 @@ class Organization_Harvester(Harvester):
83
88
  )
84
89
 
85
90
  organization_list = []
86
- for (
87
- key,
88
- value,
89
- ) in organizations.items(): # transform orga dict to list for indexing
91
+ for key, value in organizations.items(): # transform orga dict to list for indexing
90
92
  organization = value
93
+
94
+ # clean geometry: if more than one geometry is present,
95
+ # keep only the first one (should not happen, but just in case)
96
+ geom = organization.get("geometry", [])
97
+ if len(geom) > 1:
98
+ organization["geometry"] = geom[:1]
99
+
100
+ # clean rorId: if more than one rorId is present,
101
+ # keep only unique ones (should not happen, but just in case)
102
+ rorid = organization.get("rorId", [])
103
+ if len(rorid) > 1:
104
+ organization["rorId"] = list(set(rorid))
105
+
106
+ # Transform locations
107
+ localities = organization.get("locality", [])
108
+ countries = organization.get("countryName", [])
109
+ complete_locations = []
110
+
111
+ for loc, country in zip(localities, countries):
112
+ complete_locations.append(f"{loc}, {country}")
113
+
114
+ # Filter duplication
115
+ organization["location"] = list(dict.fromkeys(complete_locations))
116
+ organization["locality"] = list(dict.fromkeys(localities))
117
+ organization["countryName"] = list(dict.fromkeys(countries))
118
+
91
119
  # ensure mainTitle
92
120
  if (
93
121
  "mainTitle" not in organization
@@ -132,9 +160,23 @@ class Organization_Harvester(Harvester):
132
160
  assignto_dict[subject[0]][attribute] = []
133
161
  assignto_dict[subject[0]][attribute].extend(organization_name)
134
162
 
163
+ def does_object_exist(self, value, attribute: str, data: dict):
164
+ return attribute in data and value in data[attribute]
165
+
135
166
  def parse_response(
136
167
  self, hits, organizations, issuborganization, hasN4Econtact
137
168
  ):
169
+ PREDICATES = {
170
+ "http://xmlns.com/foaf/0.1/homepage": "homepage",
171
+ "http://www.w3.org/2002/07/owl#sameAs": "sameAs",
172
+ "http://w3id.org/nfdi4ing/metadata4ing#hasRorId": "rorId",
173
+ "http://www.w3.org/2004/02/skos/core#altLabel": "altLabel",
174
+ "http://www.w3.org/1999/02/22-rdf-syntax-ns#type": "type",
175
+ "http://nfdi4earth.de/ontology/sourceSystemURL": "sourceSystemURL",
176
+ "http://nfdi4earth.de/ontology/hasSignedCommitment": "hasSignedCommitment",
177
+ "http://nfdi4earth.de/ontology/sourceSystemID": "sourceSystem" + self.flatten_separator + "id"
178
+ }
179
+
138
180
  for hit in hits:
139
181
  subject = hit["subject"]["value"]
140
182
  predicate = hit["predicate"]["value"]
@@ -146,17 +188,22 @@ class Organization_Harvester(Harvester):
146
188
  organizations[subject]["id"] = self.getID(
147
189
  subject
148
190
  ) # use ID from triple store also in Solr to ensure stable IDs
191
+ organizations[subject]["locality"] = []
192
+ organizations[subject]["countryName"] = []
149
193
 
150
194
  # set geometry if available and not already set
151
195
  if (
152
196
  predicate == "http://www.opengis.net/ont/geosparql#hasGeometry"
153
- and ("geometry" not in organizations[subject]
154
- or hit["geo_as_wkt"]["value"] not in organizations[subject]["geometry"])
197
+ and (
198
+ "geometry" not in organizations[subject]
199
+ or hit["geo_as_wkt"]["value"]
200
+ not in organizations[subject]["geometry"]
201
+ )
155
202
  ):
156
203
  self.addValue(
157
204
  dict=organizations[subject],
158
205
  attribute="geometry",
159
- value=hit["geo_as_wkt"]["value"]
206
+ value=hit["geo_as_wkt"]["value"],
160
207
  )
161
208
 
162
209
  # set membership in N4E project
@@ -171,8 +218,9 @@ class Organization_Harvester(Harvester):
171
218
 
172
219
  if predicate == "http://schema.org/name": # name
173
220
  if (
174
- "xml:lang" not in hit["object"]
175
- or hit["object"]["xml:lang"] == "en"
221
+ ("xml:lang" not in hit["object"]
222
+ or hit["object"]["xml:lang"] == "en")
223
+ and not self.does_object_exist(object, "name", organizations[subject])
176
224
  ): # use international name for orga name
177
225
  self.addValue(
178
226
  dict=organizations[subject],
@@ -181,60 +229,30 @@ class Organization_Harvester(Harvester):
181
229
  )
182
230
  organizations[subject]["mainTitle"] = object # mainTitle
183
231
  if (
184
- "name_alt" not in organizations[subject]
185
- or object not in organizations[subject]["name_alt"]
232
+ not self.does_object_exist(object, "name_alt", organizations[subject])
186
233
  ): # prevent duplicates
187
234
  self.addValue(
188
235
  dict=organizations[subject],
189
236
  attribute="name_alt",
190
237
  value=object,
191
238
  )
192
- elif predicate == "http://xmlns.com/foaf/0.1/homepage": # homepage
193
- self.addValue(
194
- dict=organizations[subject],
195
- attribute="homepage",
196
- value=object,
197
- )
198
- elif (
199
- predicate == "http://www.w3.org/2006/vcard/ns#locality"
200
- ): # locality
201
- self.addValue(
202
- dict=organizations[subject],
203
- attribute="locality",
204
- value=object,
205
- )
206
- elif (
207
- predicate == "http://www.w3.org/2006/vcard/ns#country-name"
208
- ): # countryName
209
- self.addValue(
210
- dict=organizations[subject],
211
- attribute="countryName",
212
- value=object,
213
- )
214
- elif predicate == "http://www.w3.org/2002/07/owl#sameAs": # sameAs
215
- self.addValue(
216
- dict=organizations[subject],
217
- attribute="sameAs",
218
- value=object,
219
- )
220
- elif (
221
- predicate == "http://w3id.org/nfdi4ing/metadata4ing#hasRorId"
222
- ): # rorId
223
- self.addValue(
224
- dict=organizations[subject],
225
- attribute="rorId",
226
- value=object,
227
- )
228
- elif (
229
- predicate == "http://www.w3.org/2004/02/skos/core#altLabel"
230
- ): # altLabel
231
- self.addValue(
232
- dict=organizations[subject],
233
- attribute="altLabel",
234
- value=object,
235
- )
239
+
240
+ elif predicate == "http://www.w3.org/2006/vcard/ns#locality":
241
+ organizations[subject]["locality"].append(object)
242
+
243
+ elif predicate == "http://www.w3.org/2006/vcard/ns#country-name":
244
+ organizations[subject]["countryName"].append(object)
245
+
246
+ elif predicate in PREDICATES:
247
+ attribute = PREDICATES[predicate]
248
+ if not self.does_object_exist(object, attribute, organizations[subject]):
249
+ self.addValue(
250
+ dict=organizations[subject],
251
+ attribute=attribute,
252
+ value=object)
236
253
  elif (
237
254
  predicate == "http://www.w3.org/ns/org#subOrganizationOf"
255
+ and (subject, object) not in issuborganization
238
256
  ): # subOrganizationOf
239
257
  issuborganization.append(
240
258
  (subject, object)
@@ -242,60 +260,8 @@ class Organization_Harvester(Harvester):
242
260
  elif (
243
261
  predicate
244
262
  == "http://nfdi4earth.de/ontology/hasNFDI4EarthContactPerson"
263
+ and (subject, object) not in hasN4Econtact
245
264
  ): # NFDI4EarthContactPerson
246
265
  hasN4Econtact.append(
247
266
  (subject, object)
248
267
  ) # store, resolve contact info later
249
- elif (
250
- predicate == "http://www.w3.org/1999/02/22-rdf-syntax-ns#type"
251
- ): # type
252
- self.addValue(
253
- dict=organizations[subject], attribute="type", value=object
254
- )
255
- elif (
256
- predicate == "http://nfdi4earth.de/ontology/sourceSystem"
257
- ): # sourceSystem
258
- if "sourceSystem_homepage" in hit:
259
- self.addValue(
260
- organizations[subject],
261
- "sourceSystem" + self.flatten_separator + "homepage",
262
- hit["sourceSystem_homepage"]["value"],
263
- )
264
- if "sourceSystem_title" in hit:
265
- self.addValue(
266
- organizations[subject],
267
- "sourceSystem" + self.flatten_separator + "title",
268
- hit["sourceSystem_title"]["value"],
269
- )
270
- elif (
271
- predicate == "http://nfdi4earth.de/ontology/sourceSystemID"
272
- ): # sourceSystemID
273
- if (
274
- "sourceSystem" + self.flatten_separator + "id"
275
- not in organizations[subject]
276
- ):
277
- # only set if not already present
278
- self.addValue(
279
- dict=organizations[subject],
280
- attribute="sourceSystem"
281
- + self.flatten_separator
282
- + "id",
283
- value=object,
284
- )
285
- elif (
286
- predicate == "http://nfdi4earth.de/ontology/sourceSystemURL"
287
- ): # sourceSystemURL
288
- self.addValue(
289
- dict=organizations[subject],
290
- attribute="sourceSystemURL",
291
- value=object,
292
- )
293
- elif (
294
- predicate
295
- == "http://nfdi4earth.de/ontology/hasSignedCommitment"
296
- ): # hasSignedCommitment
297
- self.addValue(
298
- dict=organizations[subject],
299
- attribute="hasSignedCommitment",
300
- value=object,
301
- )