OneStop4All-Indexer 2.8.0.dev9__tar.gz → 2.8.0.dev10__tar.gz

This diff compares the contents of two publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those package versions as they appear in their respective public registries.
Files changed (36)
  1. {onestop4all_indexer-2.8.0.dev9 → onestop4all_indexer-2.8.0.dev10/OneStop4All_Indexer.egg-info}/PKG-INFO +1 -1
  2. {onestop4all_indexer-2.8.0.dev9/OneStop4All_Indexer.egg-info → onestop4all_indexer-2.8.0.dev10}/PKG-INFO +1 -1
  3. {onestop4all_indexer-2.8.0.dev9 → onestop4all_indexer-2.8.0.dev10}/setup.py +1 -1
  4. {onestop4all_indexer-2.8.0.dev9 → onestop4all_indexer-2.8.0.dev10}/utils/solr.py +34 -17
  5. {onestop4all_indexer-2.8.0.dev9 → onestop4all_indexer-2.8.0.dev10}/LICENSE +0 -0
  6. {onestop4all_indexer-2.8.0.dev9 → onestop4all_indexer-2.8.0.dev10}/OneStop4All_Indexer.egg-info/SOURCES.txt +0 -0
  7. {onestop4all_indexer-2.8.0.dev9 → onestop4all_indexer-2.8.0.dev10}/OneStop4All_Indexer.egg-info/dependency_links.txt +0 -0
  8. {onestop4all_indexer-2.8.0.dev9 → onestop4all_indexer-2.8.0.dev10}/OneStop4All_Indexer.egg-info/entry_points.txt +0 -0
  9. {onestop4all_indexer-2.8.0.dev9 → onestop4all_indexer-2.8.0.dev10}/OneStop4All_Indexer.egg-info/requires.txt +0 -0
  10. {onestop4all_indexer-2.8.0.dev9 → onestop4all_indexer-2.8.0.dev10}/OneStop4All_Indexer.egg-info/top_level.txt +0 -0
  11. {onestop4all_indexer-2.8.0.dev9 → onestop4all_indexer-2.8.0.dev10}/data_repositories/__init__.py +0 -0
  12. {onestop4all_indexer-2.8.0.dev9 → onestop4all_indexer-2.8.0.dev10}/data_repositories/repository_base.py +0 -0
  13. {onestop4all_indexer-2.8.0.dev9 → onestop4all_indexer-2.8.0.dev10}/data_repositories/repository_n4eorganization.py +0 -0
  14. {onestop4all_indexer-2.8.0.dev9 → onestop4all_indexer-2.8.0.dev10}/data_repositories/repository_person.py +0 -0
  15. {onestop4all_indexer-2.8.0.dev9 → onestop4all_indexer-2.8.0.dev10}/data_repositories/repository_resource_links.py +0 -0
  16. {onestop4all_indexer-2.8.0.dev9 → onestop4all_indexer-2.8.0.dev10}/data_repositories/repository_theme.py +0 -0
  17. {onestop4all_indexer-2.8.0.dev9 → onestop4all_indexer-2.8.0.dev10}/harvesters/__init__.py +0 -0
  18. {onestop4all_indexer-2.8.0.dev9 → onestop4all_indexer-2.8.0.dev10}/harvesters/harvester_article.py +0 -0
  19. {onestop4all_indexer-2.8.0.dev9 → onestop4all_indexer-2.8.0.dev10}/harvesters/harvester_base.py +0 -0
  20. {onestop4all_indexer-2.8.0.dev9 → onestop4all_indexer-2.8.0.dev10}/harvesters/harvester_dataservice.py +0 -0
  21. {onestop4all_indexer-2.8.0.dev9 → onestop4all_indexer-2.8.0.dev10}/harvesters/harvester_dataset.py +0 -0
  22. {onestop4all_indexer-2.8.0.dev9 → onestop4all_indexer-2.8.0.dev10}/harvesters/harvester_document.py +0 -0
  23. {onestop4all_indexer-2.8.0.dev9 → onestop4all_indexer-2.8.0.dev10}/harvesters/harvester_learningresource.py +0 -0
  24. {onestop4all_indexer-2.8.0.dev9 → onestop4all_indexer-2.8.0.dev10}/harvesters/harvester_metadatastandards.py +0 -0
  25. {onestop4all_indexer-2.8.0.dev9 → onestop4all_indexer-2.8.0.dev10}/harvesters/harvester_organization.py +0 -0
  26. {onestop4all_indexer-2.8.0.dev9 → onestop4all_indexer-2.8.0.dev10}/harvesters/harvester_repository.py +0 -0
  27. {onestop4all_indexer-2.8.0.dev9 → onestop4all_indexer-2.8.0.dev10}/harvesters/harvester_service.py +0 -0
  28. {onestop4all_indexer-2.8.0.dev9 → onestop4all_indexer-2.8.0.dev10}/harvesters/harvester_softwaresourcecode.py +0 -0
  29. {onestop4all_indexer-2.8.0.dev9 → onestop4all_indexer-2.8.0.dev10}/pyproject.toml +0 -0
  30. {onestop4all_indexer-2.8.0.dev9 → onestop4all_indexer-2.8.0.dev10}/setup.cfg +0 -0
  31. {onestop4all_indexer-2.8.0.dev9 → onestop4all_indexer-2.8.0.dev10}/utils/__init__.py +0 -0
  32. {onestop4all_indexer-2.8.0.dev9 → onestop4all_indexer-2.8.0.dev10}/utils/cli.py +0 -0
  33. {onestop4all_indexer-2.8.0.dev9 → onestop4all_indexer-2.8.0.dev10}/utils/configs.py +0 -0
  34. {onestop4all_indexer-2.8.0.dev9 → onestop4all_indexer-2.8.0.dev10}/utils/harvest.py +0 -0
  35. {onestop4all_indexer-2.8.0.dev9 → onestop4all_indexer-2.8.0.dev10}/utils/sparql.py +0 -0
  36. {onestop4all_indexer-2.8.0.dev9 → onestop4all_indexer-2.8.0.dev10}/utils/util.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: OneStop4All-Indexer
3
- Version: 2.8.0.dev9
3
+ Version: 2.8.0.dev10
4
4
  Summary: Library to harvest data from NFDI4Earth-KnowledgeHub to OneStop4All-Index
5
5
  Author: Markus Konkol, Arne Vogt, Tom Niers, Ralf Klammer
6
6
  Author-email: m.konkol@52north.org, a.vogt@52north.org, tom.niers@tu-dresden.de, ralf.klammer@tu-dresden.de
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: OneStop4All-Indexer
3
- Version: 2.8.0.dev9
3
+ Version: 2.8.0.dev10
4
4
  Summary: Library to harvest data from NFDI4Earth-KnowledgeHub to OneStop4All-Index
5
5
  Author: Markus Konkol, Arne Vogt, Tom Niers, Ralf Klammer
6
6
  Author-email: m.konkol@52north.org, a.vogt@52north.org, tom.niers@tu-dresden.de, ralf.klammer@tu-dresden.de
@@ -2,7 +2,7 @@ from setuptools import find_packages, setup
2
2
 
3
3
  setup(
4
4
  name="OneStop4All-Indexer",
5
- version="2.8.0.dev9",
5
+ version="2.8.0.dev10",
6
6
  description="Library to harvest data from NFDI4Earth-KnowledgeHub to OneStop4All-Index",
7
7
  author="Markus Konkol, Arne Vogt, Tom Niers, Ralf Klammer",
8
8
  author_email="m.konkol@52north.org, a.vogt@52north.org, tom.niers@tu-dresden.de, ralf.klammer@tu-dresden.de",
@@ -1,8 +1,9 @@
1
1
  import logging
2
2
 
3
- from pysolr import Solr as SolrBase
3
+ from pysolr import Solr as SolrClient
4
4
  from requests.auth import HTTPBasicAuth
5
5
  from typing import List, Dict, Optional, Literal
6
+ from urllib.parse import urljoin
6
7
 
7
8
  from utils import config
8
9
 
@@ -21,7 +22,7 @@ class Solr(object):
21
22
  self.solr_url = solr_url
22
23
  self.solr_core = solr_core
23
24
  self.auth = solr_auth
24
- self.solr = SolrBase(
25
+ self.client = SolrClient(
25
26
  self.endpoint,
26
27
  auth=self.authentication,
27
28
  always_commit=always_commit,
@@ -34,8 +35,17 @@ class Solr(object):
34
35
  # overwrite with initially given values
35
36
  # TODO: check if endpoint is reachable, if not raise error
36
37
  solr_url = self.solr_url if self.solr_url else config["solr_url"]
38
+ log.debug(f"configured solr url: {solr_url}")
39
+ if solr_url.startswith("http://"):
40
+ raise ValueError(
41
+ "Insecure solr url configured. "
42
+ "Please check your configuration and use https."
43
+ )
37
44
  solr_core = self.solr_core if self.solr_core else config["solr_core"]
38
- return f"{solr_url.rstrip('/')}/{solr_core}"
45
+ log.debug(f"configured solr core: {solr_core}")
46
+ _endpoint = urljoin(solr_url, solr_core)
47
+ log.info(f"initialized solr client with endpoint: {_endpoint}")
48
+ return _endpoint
39
49
 
40
50
  @property
41
51
  def authentication(self):
@@ -49,10 +59,7 @@ class Solr(object):
49
59
 
50
60
  def index_documents(self, documents: List[Dict]) -> None:
51
61
  # solr_endpoint = coreurl(solr_url, solr_core)
52
- log.info(
53
- f"start indexing {len(documents)} documents to {self.endpoint}"
54
- )
55
-
62
+ log.info(f"start indexing {len(documents)} documents")
56
63
  batch_size = 50000
57
64
  offset = 0
58
65
  iteration = 0
@@ -65,11 +72,11 @@ class Solr(object):
65
72
  if len(batch) == 0:
66
73
  break
67
74
 
68
- self.solr.ping()
75
+ self.client.ping()
69
76
  log.info("solr healtcheck successful")
70
77
  try:
71
78
  log.info(f"start adding batch of {len(batch)} documents")
72
- self.solr.add(batch)
79
+ self.client.add(batch)
73
80
  log.info(f"finished adding batch of {len(batch)} documents")
74
81
  except Exception as e:
75
82
  log.error(e)
@@ -81,16 +88,26 @@ class Solr(object):
81
88
  break
82
89
 
83
90
  log.info("commit changes to index")
84
- self.solr.ping()
91
+ self.client.ping()
85
92
  log.info("solr healtcheck successful")
86
- self.solr.commit()
93
+ self.client.commit()
87
94
  log.info("sucessfully commited changes to index")
88
95
  log.info("finished indexing")
89
96
 
90
97
  def reset_index(self):
91
- log.info(f"reset index for: {self.endpoint}")
92
- self.solr.delete(q="*:*")
93
- self.solr.commit()
98
+ self.client.delete(q="*:*")
99
+ self.client.commit()
100
+
101
+ def exists(self, document_id: str) -> bool:
102
+ result = self.search(f"id:{document_id}")
103
+ return result.hits > 0
104
+
105
+ def search(self, *args, **kwargs):
106
+ return self.client.search(*args, **kwargs)
107
+
108
+ def delete(self, *args, **kwargs):
109
+ self.client.delete(*args, **kwargs)
110
+ self.client.commit()
94
111
 
95
112
 
96
113
  class SolrValidator(Solr):
@@ -105,7 +122,7 @@ class SolrValidator(Solr):
105
122
  }
106
123
 
107
124
  try:
108
- self.solr.add(dataset)
125
+ self.client.add(dataset)
109
126
  return True
110
127
  except Exception as e:
111
128
  log.error(e)
@@ -114,5 +131,5 @@ class SolrValidator(Solr):
114
131
  def close(self):
115
132
  # technically not necessary to check existence of test document,
116
133
  # but when urls are misconfigured, this will throw an error
117
- if self.solr.search("id:geomValidationTest").hits:
118
- self.solr.delete(q="id:geomValidationTest")
134
+ if self.exists("geomValidationTest"):
135
+ self.delete(id="geomValidationTest")