OneStop4All-Indexer 2.8.0.dev14__tar.gz → 2.8.0.dev20__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {onestop4all_indexer-2.8.0.dev14 → onestop4all_indexer-2.8.0.dev20/OneStop4All_Indexer.egg-info}/PKG-INFO +1 -1
- {onestop4all_indexer-2.8.0.dev14/OneStop4All_Indexer.egg-info → onestop4all_indexer-2.8.0.dev20}/PKG-INFO +1 -1
- {onestop4all_indexer-2.8.0.dev14 → onestop4all_indexer-2.8.0.dev20}/harvesters/harvester_dataservice.py +117 -0
- {onestop4all_indexer-2.8.0.dev14 → onestop4all_indexer-2.8.0.dev20}/setup.py +1 -1
- {onestop4all_indexer-2.8.0.dev14 → onestop4all_indexer-2.8.0.dev20}/utils/cli.py +25 -0
- {onestop4all_indexer-2.8.0.dev14 → onestop4all_indexer-2.8.0.dev20}/utils/solr.py +2 -2
- {onestop4all_indexer-2.8.0.dev14 → onestop4all_indexer-2.8.0.dev20}/LICENSE +0 -0
- {onestop4all_indexer-2.8.0.dev14 → onestop4all_indexer-2.8.0.dev20}/OneStop4All_Indexer.egg-info/SOURCES.txt +0 -0
- {onestop4all_indexer-2.8.0.dev14 → onestop4all_indexer-2.8.0.dev20}/OneStop4All_Indexer.egg-info/dependency_links.txt +0 -0
- {onestop4all_indexer-2.8.0.dev14 → onestop4all_indexer-2.8.0.dev20}/OneStop4All_Indexer.egg-info/entry_points.txt +0 -0
- {onestop4all_indexer-2.8.0.dev14 → onestop4all_indexer-2.8.0.dev20}/OneStop4All_Indexer.egg-info/requires.txt +0 -0
- {onestop4all_indexer-2.8.0.dev14 → onestop4all_indexer-2.8.0.dev20}/OneStop4All_Indexer.egg-info/top_level.txt +0 -0
- {onestop4all_indexer-2.8.0.dev14 → onestop4all_indexer-2.8.0.dev20}/data_repositories/__init__.py +0 -0
- {onestop4all_indexer-2.8.0.dev14 → onestop4all_indexer-2.8.0.dev20}/data_repositories/repository_base.py +0 -0
- {onestop4all_indexer-2.8.0.dev14 → onestop4all_indexer-2.8.0.dev20}/data_repositories/repository_n4eorganization.py +0 -0
- {onestop4all_indexer-2.8.0.dev14 → onestop4all_indexer-2.8.0.dev20}/data_repositories/repository_person.py +0 -0
- {onestop4all_indexer-2.8.0.dev14 → onestop4all_indexer-2.8.0.dev20}/data_repositories/repository_resource_links.py +0 -0
- {onestop4all_indexer-2.8.0.dev14 → onestop4all_indexer-2.8.0.dev20}/data_repositories/repository_theme.py +0 -0
- {onestop4all_indexer-2.8.0.dev14 → onestop4all_indexer-2.8.0.dev20}/harvesters/__init__.py +0 -0
- {onestop4all_indexer-2.8.0.dev14 → onestop4all_indexer-2.8.0.dev20}/harvesters/harvester_article.py +0 -0
- {onestop4all_indexer-2.8.0.dev14 → onestop4all_indexer-2.8.0.dev20}/harvesters/harvester_base.py +0 -0
- {onestop4all_indexer-2.8.0.dev14 → onestop4all_indexer-2.8.0.dev20}/harvesters/harvester_dataset.py +0 -0
- {onestop4all_indexer-2.8.0.dev14 → onestop4all_indexer-2.8.0.dev20}/harvesters/harvester_document.py +0 -0
- {onestop4all_indexer-2.8.0.dev14 → onestop4all_indexer-2.8.0.dev20}/harvesters/harvester_learningresource.py +0 -0
- {onestop4all_indexer-2.8.0.dev14 → onestop4all_indexer-2.8.0.dev20}/harvesters/harvester_metadatastandards.py +0 -0
- {onestop4all_indexer-2.8.0.dev14 → onestop4all_indexer-2.8.0.dev20}/harvesters/harvester_organization.py +0 -0
- {onestop4all_indexer-2.8.0.dev14 → onestop4all_indexer-2.8.0.dev20}/harvesters/harvester_repository.py +0 -0
- {onestop4all_indexer-2.8.0.dev14 → onestop4all_indexer-2.8.0.dev20}/harvesters/harvester_service.py +0 -0
- {onestop4all_indexer-2.8.0.dev14 → onestop4all_indexer-2.8.0.dev20}/harvesters/harvester_software.py +0 -0
- {onestop4all_indexer-2.8.0.dev14 → onestop4all_indexer-2.8.0.dev20}/pyproject.toml +0 -0
- {onestop4all_indexer-2.8.0.dev14 → onestop4all_indexer-2.8.0.dev20}/setup.cfg +0 -0
- {onestop4all_indexer-2.8.0.dev14 → onestop4all_indexer-2.8.0.dev20}/utils/__init__.py +0 -0
- {onestop4all_indexer-2.8.0.dev14 → onestop4all_indexer-2.8.0.dev20}/utils/configs.py +0 -0
- {onestop4all_indexer-2.8.0.dev14 → onestop4all_indexer-2.8.0.dev20}/utils/harvest.py +0 -0
- {onestop4all_indexer-2.8.0.dev14 → onestop4all_indexer-2.8.0.dev20}/utils/sparql.py +0 -0
- {onestop4all_indexer-2.8.0.dev14 → onestop4all_indexer-2.8.0.dev20}/utils/util.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: OneStop4All-Indexer
|
|
3
|
-
Version: 2.8.0.dev14
|
|
3
|
+
Version: 2.8.0.dev20
|
|
4
4
|
Summary: Library to harvest data from NFDI4Earth-KnowledgeHub to OneStop4All-Index
|
|
5
5
|
Author: Markus Konkol, Arne Vogt, Tom Niers, Ralf Klammer
|
|
6
6
|
Author-email: m.konkol@52north.org, a.vogt@52north.org, tom.niers@tu-dresden.de, ralf.klammer@tu-dresden.de
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: OneStop4All-Indexer
|
|
3
|
-
Version: 2.8.0.dev14
|
|
3
|
+
Version: 2.8.0.dev20
|
|
4
4
|
Summary: Library to harvest data from NFDI4Earth-KnowledgeHub to OneStop4All-Index
|
|
5
5
|
Author: Markus Konkol, Arne Vogt, Tom Niers, Ralf Klammer
|
|
6
6
|
Author-email: m.konkol@52north.org, a.vogt@52north.org, tom.niers@tu-dresden.de, ralf.klammer@tu-dresden.de
|
|
@@ -427,3 +427,120 @@ iter_end={self.iteration_end}"""
|
|
|
427
427
|
return geom_wkt
|
|
428
428
|
else:
|
|
429
429
|
return geom_wkt
|
|
430
|
+
|
|
431
|
+
|
|
432
|
+
# Batched Harvester für DataServices
|
|
433
|
+
class DataService_HarvesterBatched(DataService_Harvester):
|
|
434
|
+
|
|
435
|
+
sparql_query_template = """
|
|
436
|
+
PREFIX fo: <http://www.w3.org/1999/XSL/Format#>
|
|
437
|
+
PREFIX foaf: <http://xmlns.com/foaf/0.1/>
|
|
438
|
+
PREFIX dc: <http://purl.org/dc/elements/1.1/>
|
|
439
|
+
PREFIX vcard: <http://www.w3.org/2006/vcard/ns#>
|
|
440
|
+
prefix dcat: <http://www.w3.org/ns/dcat#>
|
|
441
|
+
prefix dct: <http://purl.org/dc/terms/>
|
|
442
|
+
prefix n4e: <http://nfdi4earth.de/ontology/>
|
|
443
|
+
prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
|
|
444
|
+
|
|
445
|
+
SELECT ?subject ?predicate ?object ?contactpoint_email ?contactpoint_url ?sourceSystem_title ?sourceSystem_homepage
|
|
446
|
+
{
|
|
447
|
+
VALUES ?subject { %s }
|
|
448
|
+
{
|
|
449
|
+
?subject rdf:type dcat:DataService.
|
|
450
|
+
?subject ?predicate ?object.
|
|
451
|
+
FILTER (?predicate NOT IN (<http://purl.org/dc/terms/publisher>, dct:spatial, dcat:contactPoint, n4e:sourceSystem))
|
|
452
|
+
}
|
|
453
|
+
UNION{
|
|
454
|
+
VALUES ?predicate { <http://purl.org/dc/terms/publisher> }
|
|
455
|
+
?subject rdf:type dcat:DataService;
|
|
456
|
+
<http://purl.org/dc/terms/publisher> ?publisher.
|
|
457
|
+
?publisher <http://schema.org/name> ?object.
|
|
458
|
+
}
|
|
459
|
+
UNION{
|
|
460
|
+
VALUES ?predicate { dct:spatial }
|
|
461
|
+
?subject rdf:type dcat:DataService;
|
|
462
|
+
dct:spatial ?spatial.
|
|
463
|
+
?spatial dcat:bbox ?object
|
|
464
|
+
}
|
|
465
|
+
UNION {
|
|
466
|
+
VALUES ?predicate { dcat:contactPoint }
|
|
467
|
+
?subject rdf:type dcat:DataService;
|
|
468
|
+
dcat:contactPoint ?object.
|
|
469
|
+
optional { ?object vcard:hasEmail ?contactpoint_email. }
|
|
470
|
+
optional { ?object vcard:hasURL ?contactpoint_url. }
|
|
471
|
+
}
|
|
472
|
+
UNION{
|
|
473
|
+
VALUES ?predicate { n4e:sourceSystem }
|
|
474
|
+
?subject rdf:type dcat:DataService;
|
|
475
|
+
n4e:sourceSystem ?object.
|
|
476
|
+
optional {?object dct:title ?sourceSystem_title.}
|
|
477
|
+
optional {?object foaf:homepage ?sourceSystem_homepage.}
|
|
478
|
+
}
|
|
479
|
+
}
|
|
480
|
+
"""
|
|
481
|
+
|
|
482
|
+
def get_all_ids(self):
|
|
483
|
+
"""
|
|
484
|
+
Holt alle DataService-URIs aus dem Triple-Store, auch wenn der Endpoint ein Limit hat.
|
|
485
|
+
"""
|
|
486
|
+
subjects = set()
|
|
487
|
+
offset = 0
|
|
488
|
+
limit = 10000 # typisches Virtuoso-Limit, ggf. anpassen
|
|
489
|
+
while True:
|
|
490
|
+
query = f"""
|
|
491
|
+
PREFIX dcat: <http://www.w3.org/ns/dcat#>
|
|
492
|
+
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
|
|
493
|
+
SELECT DISTINCT ?subject WHERE {{ ?subject rdf:type dcat:DataService. }} OFFSET {offset} LIMIT {limit}
|
|
494
|
+
"""
|
|
495
|
+
hits = sparql.execute_query(self.sparql_endpoint, query)
|
|
496
|
+
batch_subjects = [hit["subject"]["value"] for hit in hits]
|
|
497
|
+
if not batch_subjects:
|
|
498
|
+
break
|
|
499
|
+
subjects.update(batch_subjects)
|
|
500
|
+
if len(batch_subjects) < limit:
|
|
501
|
+
break
|
|
502
|
+
offset += limit
|
|
503
|
+
return list(subjects)
|
|
504
|
+
|
|
505
|
+
def get_triples_for_subjects(self, subject_uris):
|
|
506
|
+
"""
|
|
507
|
+
Holt alle Tripel für eine Liste von Subject-URIs.
|
|
508
|
+
"""
|
|
509
|
+
values_str = " ".join(f"<{uri}>" for uri in subject_uris)
|
|
510
|
+
query = self.sparql_query_template % values_str
|
|
511
|
+
hits = sparql.execute_query(self.sparql_endpoint, query)
|
|
512
|
+
return hits
|
|
513
|
+
|
|
514
|
+
def harvest(self, records, batch_size=10):
|
|
515
|
+
services = {}
|
|
516
|
+
# dsh_batched.parse_response(triples, services)
|
|
517
|
+
while True:
|
|
518
|
+
# process in smaller sub-batches to avoid too long query string
|
|
519
|
+
records_sliced_sublist = records[:batch_size]
|
|
520
|
+
del records[:batch_size]
|
|
521
|
+
log.info(f"Remaining records in batch: {len(records)}")
|
|
522
|
+
if not records_sliced_sublist:
|
|
523
|
+
break
|
|
524
|
+
triples = self.get_triples_for_subjects(
|
|
525
|
+
records_sliced_sublist,
|
|
526
|
+
)
|
|
527
|
+
self.parse_response(triples, services)
|
|
528
|
+
|
|
529
|
+
self.solr_validator.close()
|
|
530
|
+
|
|
531
|
+
services_list = []
|
|
532
|
+
for (
|
|
533
|
+
key,
|
|
534
|
+
value,
|
|
535
|
+
) in services.items(): # transform repos dict to list for indexing
|
|
536
|
+
service = value
|
|
537
|
+
if "mainTitle" not in service:
|
|
538
|
+
if len(service.get("title", [])) > 0:
|
|
539
|
+
service["mainTitle"] = service["title"][0]
|
|
540
|
+
else:
|
|
541
|
+
log.warning(f"Service {service['id']} has no mainTitle")
|
|
542
|
+
continue # skip services without mainTitle
|
|
543
|
+
service["mainTitle"] = service["mainTitle"].strip()
|
|
544
|
+
services_list.append(service)
|
|
545
|
+
|
|
546
|
+
return services_list
|
|
@@ -2,7 +2,7 @@ from setuptools import find_packages, setup
|
|
|
2
2
|
|
|
3
3
|
setup(
|
|
4
4
|
name="OneStop4All-Indexer",
|
|
5
|
-
version="2.8.0.dev14",
|
|
5
|
+
version="2.8.0.dev20",
|
|
6
6
|
description="Library to harvest data from NFDI4Earth-KnowledgeHub to OneStop4All-Index",
|
|
7
7
|
author="Markus Konkol, Arne Vogt, Tom Niers, Ralf Klammer",
|
|
8
8
|
author_email="m.konkol@52north.org, a.vogt@52north.org, tom.niers@tu-dresden.de, ralf.klammer@tu-dresden.de",
|
|
@@ -127,6 +127,31 @@ def dataservices(**options):
|
|
|
127
127
|
)
|
|
128
128
|
|
|
129
129
|
|
|
130
|
+
@debug.command()
|
|
131
|
+
def dataservices_batched():
|
|
132
|
+
from utils import config
|
|
133
|
+
from harvesters.harvester_dataservice import DataService_HarvesterBatched
|
|
134
|
+
from data_repositories import RepositoryPerson, RepositoryTheme
|
|
135
|
+
|
|
136
|
+
print("und los")
|
|
137
|
+
print(config)
|
|
138
|
+
persons_repo = RepositoryPerson()
|
|
139
|
+
themes_repo = RepositoryTheme()
|
|
140
|
+
dsh_batched = DataService_HarvesterBatched(
|
|
141
|
+
persons_repo=persons_repo,
|
|
142
|
+
themes_repo=themes_repo,
|
|
143
|
+
solr_validation=False,
|
|
144
|
+
)
|
|
145
|
+
ids = dsh_batched.get_all_ids()
|
|
146
|
+
batch_size = 10
|
|
147
|
+
for i in range(0, len(ids), batch_size):
|
|
148
|
+
# print(f"Batch {i//batch_size}: IDs {i} to {i+batch_size}")
|
|
149
|
+
ids_batched = ids[i : i + batch_size]
|
|
150
|
+
print(f"Batch {i//batch_size}: IDs {ids_batched}")
|
|
151
|
+
triples = dsh_batched.get_triples_for_subjects(ids_batched)
|
|
152
|
+
print(f"Batch {i//batch_size}: Triples {len(triples)}")
|
|
153
|
+
|
|
154
|
+
|
|
130
155
|
@main.command()
|
|
131
156
|
@click.pass_context
|
|
132
157
|
def reset(ctx):
|
|
@@ -21,8 +21,8 @@ class Solr(object):
|
|
|
21
21
|
timeout: int = 5 * 60,
|
|
22
22
|
) -> None:
|
|
23
23
|
self.solr_url = solr_url if solr_url else config["solr_url"]
|
|
24
|
-
self.solr_core = solr_core if
|
|
25
|
-
self.auth = solr_auth if
|
|
24
|
+
self.solr_core = solr_core if solr_core else config["solr_core"]
|
|
25
|
+
self.auth = solr_auth if solr_auth else config["solr_auth"]
|
|
26
26
|
self.client = SolrClient(
|
|
27
27
|
self.endpoint,
|
|
28
28
|
auth=self.authentication,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{onestop4all_indexer-2.8.0.dev14 → onestop4all_indexer-2.8.0.dev20}/data_repositories/__init__.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{onestop4all_indexer-2.8.0.dev14 → onestop4all_indexer-2.8.0.dev20}/harvesters/harvester_article.py
RENAMED
|
File without changes
|
{onestop4all_indexer-2.8.0.dev14 → onestop4all_indexer-2.8.0.dev20}/harvesters/harvester_base.py
RENAMED
|
File without changes
|
{onestop4all_indexer-2.8.0.dev14 → onestop4all_indexer-2.8.0.dev20}/harvesters/harvester_dataset.py
RENAMED
|
File without changes
|
{onestop4all_indexer-2.8.0.dev14 → onestop4all_indexer-2.8.0.dev20}/harvesters/harvester_document.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{onestop4all_indexer-2.8.0.dev14 → onestop4all_indexer-2.8.0.dev20}/harvesters/harvester_service.py
RENAMED
|
File without changes
|
{onestop4all_indexer-2.8.0.dev14 → onestop4all_indexer-2.8.0.dev20}/harvesters/harvester_software.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|