OneStop4All-Indexer 2.8.0.dev15__tar.gz → 2.8.0.dev20__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (36) hide show
  1. {onestop4all_indexer-2.8.0.dev15 → onestop4all_indexer-2.8.0.dev20/OneStop4All_Indexer.egg-info}/PKG-INFO +1 -1
  2. {onestop4all_indexer-2.8.0.dev15/OneStop4All_Indexer.egg-info → onestop4all_indexer-2.8.0.dev20}/PKG-INFO +1 -1
  3. {onestop4all_indexer-2.8.0.dev15 → onestop4all_indexer-2.8.0.dev20}/harvesters/harvester_dataservice.py +117 -0
  4. {onestop4all_indexer-2.8.0.dev15 → onestop4all_indexer-2.8.0.dev20}/setup.py +1 -1
  5. {onestop4all_indexer-2.8.0.dev15 → onestop4all_indexer-2.8.0.dev20}/utils/cli.py +25 -0
  6. {onestop4all_indexer-2.8.0.dev15 → onestop4all_indexer-2.8.0.dev20}/LICENSE +0 -0
  7. {onestop4all_indexer-2.8.0.dev15 → onestop4all_indexer-2.8.0.dev20}/OneStop4All_Indexer.egg-info/SOURCES.txt +0 -0
  8. {onestop4all_indexer-2.8.0.dev15 → onestop4all_indexer-2.8.0.dev20}/OneStop4All_Indexer.egg-info/dependency_links.txt +0 -0
  9. {onestop4all_indexer-2.8.0.dev15 → onestop4all_indexer-2.8.0.dev20}/OneStop4All_Indexer.egg-info/entry_points.txt +0 -0
  10. {onestop4all_indexer-2.8.0.dev15 → onestop4all_indexer-2.8.0.dev20}/OneStop4All_Indexer.egg-info/requires.txt +0 -0
  11. {onestop4all_indexer-2.8.0.dev15 → onestop4all_indexer-2.8.0.dev20}/OneStop4All_Indexer.egg-info/top_level.txt +0 -0
  12. {onestop4all_indexer-2.8.0.dev15 → onestop4all_indexer-2.8.0.dev20}/data_repositories/__init__.py +0 -0
  13. {onestop4all_indexer-2.8.0.dev15 → onestop4all_indexer-2.8.0.dev20}/data_repositories/repository_base.py +0 -0
  14. {onestop4all_indexer-2.8.0.dev15 → onestop4all_indexer-2.8.0.dev20}/data_repositories/repository_n4eorganization.py +0 -0
  15. {onestop4all_indexer-2.8.0.dev15 → onestop4all_indexer-2.8.0.dev20}/data_repositories/repository_person.py +0 -0
  16. {onestop4all_indexer-2.8.0.dev15 → onestop4all_indexer-2.8.0.dev20}/data_repositories/repository_resource_links.py +0 -0
  17. {onestop4all_indexer-2.8.0.dev15 → onestop4all_indexer-2.8.0.dev20}/data_repositories/repository_theme.py +0 -0
  18. {onestop4all_indexer-2.8.0.dev15 → onestop4all_indexer-2.8.0.dev20}/harvesters/__init__.py +0 -0
  19. {onestop4all_indexer-2.8.0.dev15 → onestop4all_indexer-2.8.0.dev20}/harvesters/harvester_article.py +0 -0
  20. {onestop4all_indexer-2.8.0.dev15 → onestop4all_indexer-2.8.0.dev20}/harvesters/harvester_base.py +0 -0
  21. {onestop4all_indexer-2.8.0.dev15 → onestop4all_indexer-2.8.0.dev20}/harvesters/harvester_dataset.py +0 -0
  22. {onestop4all_indexer-2.8.0.dev15 → onestop4all_indexer-2.8.0.dev20}/harvesters/harvester_document.py +0 -0
  23. {onestop4all_indexer-2.8.0.dev15 → onestop4all_indexer-2.8.0.dev20}/harvesters/harvester_learningresource.py +0 -0
  24. {onestop4all_indexer-2.8.0.dev15 → onestop4all_indexer-2.8.0.dev20}/harvesters/harvester_metadatastandards.py +0 -0
  25. {onestop4all_indexer-2.8.0.dev15 → onestop4all_indexer-2.8.0.dev20}/harvesters/harvester_organization.py +0 -0
  26. {onestop4all_indexer-2.8.0.dev15 → onestop4all_indexer-2.8.0.dev20}/harvesters/harvester_repository.py +0 -0
  27. {onestop4all_indexer-2.8.0.dev15 → onestop4all_indexer-2.8.0.dev20}/harvesters/harvester_service.py +0 -0
  28. {onestop4all_indexer-2.8.0.dev15 → onestop4all_indexer-2.8.0.dev20}/harvesters/harvester_software.py +0 -0
  29. {onestop4all_indexer-2.8.0.dev15 → onestop4all_indexer-2.8.0.dev20}/pyproject.toml +0 -0
  30. {onestop4all_indexer-2.8.0.dev15 → onestop4all_indexer-2.8.0.dev20}/setup.cfg +0 -0
  31. {onestop4all_indexer-2.8.0.dev15 → onestop4all_indexer-2.8.0.dev20}/utils/__init__.py +0 -0
  32. {onestop4all_indexer-2.8.0.dev15 → onestop4all_indexer-2.8.0.dev20}/utils/configs.py +0 -0
  33. {onestop4all_indexer-2.8.0.dev15 → onestop4all_indexer-2.8.0.dev20}/utils/harvest.py +0 -0
  34. {onestop4all_indexer-2.8.0.dev15 → onestop4all_indexer-2.8.0.dev20}/utils/solr.py +0 -0
  35. {onestop4all_indexer-2.8.0.dev15 → onestop4all_indexer-2.8.0.dev20}/utils/sparql.py +0 -0
  36. {onestop4all_indexer-2.8.0.dev15 → onestop4all_indexer-2.8.0.dev20}/utils/util.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: OneStop4All-Indexer
3
- Version: 2.8.0.dev15
3
+ Version: 2.8.0.dev20
4
4
  Summary: Library to harvest data from NFDI4Earth-KnowledgeHub to OneStop4All-Index
5
5
  Author: Markus Konkol, Arne Vogt, Tom Niers, Ralf Klammer
6
6
  Author-email: m.konkol@52north.org, a.vogt@52north.org, tom.niers@tu-dresden.de, ralf.klammer@tu-dresden.de
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: OneStop4All-Indexer
3
- Version: 2.8.0.dev15
3
+ Version: 2.8.0.dev20
4
4
  Summary: Library to harvest data from NFDI4Earth-KnowledgeHub to OneStop4All-Index
5
5
  Author: Markus Konkol, Arne Vogt, Tom Niers, Ralf Klammer
6
6
  Author-email: m.konkol@52north.org, a.vogt@52north.org, tom.niers@tu-dresden.de, ralf.klammer@tu-dresden.de
@@ -427,3 +427,120 @@ iter_end={self.iteration_end}"""
427
427
  return geom_wkt
428
428
  else:
429
429
  return geom_wkt
430
+
431
+
432
# Batched harvester for DataServices
class DataService_HarvesterBatched(DataService_Harvester):
    """Variant of ``DataService_Harvester`` that queries the endpoint in batches.

    The subject URIs of all ``dcat:DataService`` resources are collected first
    (:meth:`get_all_ids`); their triples are then requested for a small number
    of subjects per SPARQL query (:meth:`harvest`), which keeps every single
    query string and result set small.
    """

    # ``%s`` is replaced by the space-separated ``<uri>`` list that fills the
    # VALUES clause restricting ?subject (see get_triples_for_subjects).
    sparql_query_template = """
        PREFIX fo: <http://www.w3.org/1999/XSL/Format#>
        PREFIX foaf: <http://xmlns.com/foaf/0.1/>
        PREFIX dc: <http://purl.org/dc/elements/1.1/>
        PREFIX vcard: <http://www.w3.org/2006/vcard/ns#>
        prefix dcat: <http://www.w3.org/ns/dcat#>
        prefix dct: <http://purl.org/dc/terms/>
        prefix n4e: <http://nfdi4earth.de/ontology/>
        prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>

        SELECT ?subject ?predicate ?object ?contactpoint_email ?contactpoint_url ?sourceSystem_title ?sourceSystem_homepage
        {
            VALUES ?subject { %s }
            {
                ?subject rdf:type dcat:DataService.
                ?subject ?predicate ?object.
                FILTER (?predicate NOT IN (<http://purl.org/dc/terms/publisher>, dct:spatial, dcat:contactPoint, n4e:sourceSystem))
            }
            UNION{
                VALUES ?predicate { <http://purl.org/dc/terms/publisher> }
                ?subject rdf:type dcat:DataService;
                    <http://purl.org/dc/terms/publisher> ?publisher.
                ?publisher <http://schema.org/name> ?object.
            }
            UNION{
                VALUES ?predicate { dct:spatial }
                ?subject rdf:type dcat:DataService;
                    dct:spatial ?spatial.
                ?spatial dcat:bbox ?object
            }
            UNION {
                VALUES ?predicate { dcat:contactPoint }
                ?subject rdf:type dcat:DataService;
                    dcat:contactPoint ?object.
                optional { ?object vcard:hasEmail ?contactpoint_email. }
                optional { ?object vcard:hasURL ?contactpoint_url. }
            }
            UNION{
                VALUES ?predicate { n4e:sourceSystem }
                ?subject rdf:type dcat:DataService;
                    n4e:sourceSystem ?object.
                optional {?object dct:title ?sourceSystem_title.}
                optional {?object foaf:homepage ?sourceSystem_homepage.}
            }
        }
    """

    # Characters the SPARQL grammar forbids inside ``<...>`` (IRIREF), in
    # addition to everything up to and including U+0020.
    _IRI_FORBIDDEN_CHARS = frozenset('<>"{}|^`\\')

    @classmethod
    def _is_safe_iri(cls, uri):
        """Return True if *uri* can be embedded verbatim as ``<uri>`` in a query."""
        return (
            isinstance(uri, str)
            and bool(uri)
            and not any(
                ch <= " " or ch in cls._IRI_FORBIDDEN_CHARS for ch in uri
            )
        )

    def get_all_ids(self, page_size=10000):
        """Fetch the URIs of all ``dcat:DataService`` subjects, page by page.

        The endpoint is queried repeatedly with OFFSET/LIMIT until a page comes
        back shorter than ``page_size``.

        Args:
            page_size: Number of subjects requested per query. It must not
                exceed the result cap of the endpoint, otherwise a capped
                (short) page is mistaken for the last one. The default of
                10000 was described in the original code as a typical
                Virtuoso limit.

        Returns:
            Sorted list of distinct subject URIs.

        Raises:
            ValueError: If ``page_size`` is smaller than 1.
        """
        if page_size < 1:
            raise ValueError(f"page_size must be >= 1, got {page_size}")

        subjects = set()
        offset = 0
        while True:
            # OFFSET/LIMIT only select well-defined pages when the solution
            # order is fixed, hence the ORDER BY. It is placed in a sub-select
            # so the paging itself is applied to an already sorted result.
            # NOTE(review): the sub-select form is chosen because Virtuoso is
            # reported to reject large sorted OFFSET/LIMIT windows otherwise;
            # confirm against the deployed endpoint.
            query = f"""
                PREFIX dcat: <http://www.w3.org/ns/dcat#>
                PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
                SELECT ?subject WHERE {{
                    {{
                        SELECT DISTINCT ?subject
                        WHERE {{ ?subject rdf:type dcat:DataService. }}
                        ORDER BY ?subject
                    }}
                }}
                OFFSET {offset} LIMIT {page_size}
            """
            hits = sparql.execute_query(self.sparql_endpoint, query)
            page = [hit["subject"]["value"] for hit in hits]
            subjects.update(page)
            # A short (or empty) page means there is nothing left to fetch.
            if len(page) < page_size:
                break
            offset += page_size
        # Sorted so repeated runs hand out the subjects in the same order.
        return sorted(subjects)

    def get_triples_for_subjects(self, subject_uris):
        """Fetch all triples for the given subject URIs with a single query.

        Args:
            subject_uris: Iterable of subject URIs (plain strings, without the
                surrounding angle brackets).

        Returns:
            The result of ``sparql.execute_query`` for the filled-in
            ``sparql_query_template``, or an empty list if no usable URI is
            left (no query is sent in that case).
        """
        safe_uris = []
        for uri in subject_uris:
            if self._is_safe_iri(uri):
                safe_uris.append(uri)
            else:
                # Embedding such a value would break (or alter) the query.
                log.warning(f"Skipping subject with invalid IRI: {uri!r}")
        if not safe_uris:
            return []

        values_str = " ".join(f"<{uri}>" for uri in safe_uris)
        query = self.sparql_query_template % values_str
        return sparql.execute_query(self.sparql_endpoint, query)

    def harvest(self, records, batch_size=10):
        """Harvest the given subjects in sub-batches and return index documents.

        Args:
            records: Iterable of DataService subject URIs. It is not modified.
            batch_size: Number of subjects per SPARQL query; kept small to
                avoid overly long query strings.

        Returns:
            List of service dicts that have a stripped ``mainTitle``. Services
            without ``mainTitle`` fall back to their first ``title``; services
            with neither are skipped with a warning.

        Raises:
            ValueError: If ``batch_size`` is smaller than 1.
        """
        if batch_size < 1:
            raise ValueError(f"batch_size must be >= 1, got {batch_size}")

        # Work on a private copy so the caller's list is left untouched.
        pending = list(records)
        services = {}
        try:
            for start in range(0, len(pending), batch_size):
                batch = pending[start : start + batch_size]
                remaining = max(len(pending) - start - batch_size, 0)
                log.info(f"Remaining records in batch: {remaining}")
                triples = self.get_triples_for_subjects(batch)
                self.parse_response(triples, services)
        finally:
            # Release the validator even if one of the queries failed.
            self.solr_validator.close()

        # Transform the services dict into a list for indexing.
        services_list = []
        for key, service in services.items():
            if "mainTitle" not in service:
                titles = service.get("title")
                if not titles:
                    log.warning(
                        f"Service {service.get('id', key)} has no mainTitle"
                    )
                    continue  # skip services without mainTitle
                service["mainTitle"] = titles[0]
            service["mainTitle"] = service["mainTitle"].strip()
            services_list.append(service)

        return services_list
@@ -2,7 +2,7 @@ from setuptools import find_packages, setup
2
2
 
3
3
  setup(
4
4
  name="OneStop4All-Indexer",
5
- version="2.8.0.dev15",
5
+ version="2.8.0.dev20",
6
6
  description="Library to harvest data from NFDI4Earth-KnowledgeHub to OneStop4All-Index",
7
7
  author="Markus Konkol, Arne Vogt, Tom Niers, Ralf Klammer",
8
8
  author_email="m.konkol@52north.org, a.vogt@52north.org, tom.niers@tu-dresden.de, ralf.klammer@tu-dresden.de",
@@ -127,6 +127,31 @@ def dataservices(**options):
127
127
  )
128
128
 
129
129
 
130
@debug.command()
def dataservices_batched():
    """Debug helper: list all DataService ids and fetch their triples in batches.

    Prints the loaded config, then for every batch of 10 ids the ids
    themselves and the number of result rows returned for them.
    """
    from utils import config
    from harvesters.harvester_dataservice import DataService_HarvesterBatched
    from data_repositories import RepositoryPerson, RepositoryTheme

    print("und los")
    print(config)

    harvester = DataService_HarvesterBatched(
        persons_repo=RepositoryPerson(),
        themes_repo=RepositoryTheme(),
        solr_validation=False,
    )
    ids = harvester.get_all_ids()

    batch_size = 10
    # enumerate() yields the running batch number (same value as start // batch_size).
    for batch_no, start in enumerate(range(0, len(ids), batch_size)):
        ids_batched = ids[start : start + batch_size]
        print(f"Batch {batch_no}: IDs {ids_batched}")
        triples = harvester.get_triples_for_subjects(ids_batched)
        print(f"Batch {batch_no}: Triples {len(triples)}")
153
+
154
+
130
155
  @main.command()
131
156
  @click.pass_context
132
157
  def reset(ctx):