OneStop4All-Indexer 2.8.0.dev9__tar.gz → 2.8.0.dev11__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (36)
  1. {onestop4all_indexer-2.8.0.dev9 → onestop4all_indexer-2.8.0.dev11/OneStop4All_Indexer.egg-info}/PKG-INFO +2 -1
  2. {onestop4all_indexer-2.8.0.dev9 → onestop4all_indexer-2.8.0.dev11}/OneStop4All_Indexer.egg-info/requires.txt +1 -0
  3. {onestop4all_indexer-2.8.0.dev9/OneStop4All_Indexer.egg-info → onestop4all_indexer-2.8.0.dev11}/PKG-INFO +2 -1
  4. {onestop4all_indexer-2.8.0.dev9 → onestop4all_indexer-2.8.0.dev11}/harvesters/harvester_dataset.py +0 -1
  5. {onestop4all_indexer-2.8.0.dev9 → onestop4all_indexer-2.8.0.dev11}/setup.py +5 -2
  6. {onestop4all_indexer-2.8.0.dev9 → onestop4all_indexer-2.8.0.dev11}/utils/solr.py +46 -21
  7. {onestop4all_indexer-2.8.0.dev9 → onestop4all_indexer-2.8.0.dev11}/LICENSE +0 -0
  8. {onestop4all_indexer-2.8.0.dev9 → onestop4all_indexer-2.8.0.dev11}/OneStop4All_Indexer.egg-info/SOURCES.txt +0 -0
  9. {onestop4all_indexer-2.8.0.dev9 → onestop4all_indexer-2.8.0.dev11}/OneStop4All_Indexer.egg-info/dependency_links.txt +0 -0
  10. {onestop4all_indexer-2.8.0.dev9 → onestop4all_indexer-2.8.0.dev11}/OneStop4All_Indexer.egg-info/entry_points.txt +0 -0
  11. {onestop4all_indexer-2.8.0.dev9 → onestop4all_indexer-2.8.0.dev11}/OneStop4All_Indexer.egg-info/top_level.txt +0 -0
  12. {onestop4all_indexer-2.8.0.dev9 → onestop4all_indexer-2.8.0.dev11}/data_repositories/__init__.py +0 -0
  13. {onestop4all_indexer-2.8.0.dev9 → onestop4all_indexer-2.8.0.dev11}/data_repositories/repository_base.py +0 -0
  14. {onestop4all_indexer-2.8.0.dev9 → onestop4all_indexer-2.8.0.dev11}/data_repositories/repository_n4eorganization.py +0 -0
  15. {onestop4all_indexer-2.8.0.dev9 → onestop4all_indexer-2.8.0.dev11}/data_repositories/repository_person.py +0 -0
  16. {onestop4all_indexer-2.8.0.dev9 → onestop4all_indexer-2.8.0.dev11}/data_repositories/repository_resource_links.py +0 -0
  17. {onestop4all_indexer-2.8.0.dev9 → onestop4all_indexer-2.8.0.dev11}/data_repositories/repository_theme.py +0 -0
  18. {onestop4all_indexer-2.8.0.dev9 → onestop4all_indexer-2.8.0.dev11}/harvesters/__init__.py +0 -0
  19. {onestop4all_indexer-2.8.0.dev9 → onestop4all_indexer-2.8.0.dev11}/harvesters/harvester_article.py +0 -0
  20. {onestop4all_indexer-2.8.0.dev9 → onestop4all_indexer-2.8.0.dev11}/harvesters/harvester_base.py +0 -0
  21. {onestop4all_indexer-2.8.0.dev9 → onestop4all_indexer-2.8.0.dev11}/harvesters/harvester_dataservice.py +0 -0
  22. {onestop4all_indexer-2.8.0.dev9 → onestop4all_indexer-2.8.0.dev11}/harvesters/harvester_document.py +0 -0
  23. {onestop4all_indexer-2.8.0.dev9 → onestop4all_indexer-2.8.0.dev11}/harvesters/harvester_learningresource.py +0 -0
  24. {onestop4all_indexer-2.8.0.dev9 → onestop4all_indexer-2.8.0.dev11}/harvesters/harvester_metadatastandards.py +0 -0
  25. {onestop4all_indexer-2.8.0.dev9 → onestop4all_indexer-2.8.0.dev11}/harvesters/harvester_organization.py +0 -0
  26. {onestop4all_indexer-2.8.0.dev9 → onestop4all_indexer-2.8.0.dev11}/harvesters/harvester_repository.py +0 -0
  27. {onestop4all_indexer-2.8.0.dev9 → onestop4all_indexer-2.8.0.dev11}/harvesters/harvester_service.py +0 -0
  28. {onestop4all_indexer-2.8.0.dev9 → onestop4all_indexer-2.8.0.dev11}/harvesters/harvester_softwaresourcecode.py +0 -0
  29. {onestop4all_indexer-2.8.0.dev9 → onestop4all_indexer-2.8.0.dev11}/pyproject.toml +0 -0
  30. {onestop4all_indexer-2.8.0.dev9 → onestop4all_indexer-2.8.0.dev11}/setup.cfg +0 -0
  31. {onestop4all_indexer-2.8.0.dev9 → onestop4all_indexer-2.8.0.dev11}/utils/__init__.py +0 -0
  32. {onestop4all_indexer-2.8.0.dev9 → onestop4all_indexer-2.8.0.dev11}/utils/cli.py +0 -0
  33. {onestop4all_indexer-2.8.0.dev9 → onestop4all_indexer-2.8.0.dev11}/utils/configs.py +0 -0
  34. {onestop4all_indexer-2.8.0.dev9 → onestop4all_indexer-2.8.0.dev11}/utils/harvest.py +0 -0
  35. {onestop4all_indexer-2.8.0.dev9 → onestop4all_indexer-2.8.0.dev11}/utils/sparql.py +0 -0
  36. {onestop4all_indexer-2.8.0.dev9 → onestop4all_indexer-2.8.0.dev11}/utils/util.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: OneStop4All-Indexer
3
- Version: 2.8.0.dev9
3
+ Version: 2.8.0.dev11
4
4
  Summary: Library to harvest data from NFDI4Earth-KnowledgeHub to OneStop4All-Index
5
5
  Author: Markus Konkol, Arne Vogt, Tom Niers, Ralf Klammer
6
6
  Author-email: m.konkol@52north.org, a.vogt@52north.org, tom.niers@tu-dresden.de, ralf.klammer@tu-dresden.de
@@ -13,6 +13,7 @@ Requires-Dist: geomet~=1.1.0
13
13
  Requires-Dist: shapely~=2.0.5
14
14
  Provides-Extra: airflow
15
15
  Requires-Dist: apache-airflow==3.1.7; extra == "airflow"
16
+ Requires-Dist: apache-airflow-providers-amazon[s3fs]; extra == "airflow"
16
17
  Dynamic: author
17
18
  Dynamic: author-email
18
19
  Dynamic: license-file
@@ -7,3 +7,4 @@ shapely~=2.0.5
7
7
 
8
8
  [airflow]
9
9
  apache-airflow==3.1.7
10
+ apache-airflow-providers-amazon[s3fs]
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: OneStop4All-Indexer
3
- Version: 2.8.0.dev9
3
+ Version: 2.8.0.dev11
4
4
  Summary: Library to harvest data from NFDI4Earth-KnowledgeHub to OneStop4All-Index
5
5
  Author: Markus Konkol, Arne Vogt, Tom Niers, Ralf Klammer
6
6
  Author-email: m.konkol@52north.org, a.vogt@52north.org, tom.niers@tu-dresden.de, ralf.klammer@tu-dresden.de
@@ -13,6 +13,7 @@ Requires-Dist: geomet~=1.1.0
13
13
  Requires-Dist: shapely~=2.0.5
14
14
  Provides-Extra: airflow
15
15
  Requires-Dist: apache-airflow==3.1.7; extra == "airflow"
16
+ Requires-Dist: apache-airflow-providers-amazon[s3fs]; extra == "airflow"
16
17
  Dynamic: author
17
18
  Dynamic: author-email
18
19
  Dynamic: license-file
@@ -299,7 +299,6 @@ class Dataset_Harvester(HarvesterCordra):
299
299
  value=val,
300
300
  )
301
301
  if "downloadURL" in distribution:
302
- print(distribution["downloadURL"])
303
302
  for download_url in distribution["downloadURL"]:
304
303
  val = self.get_string_from_jsonld(
305
304
  download_url, subject
@@ -2,7 +2,7 @@ from setuptools import find_packages, setup
2
2
 
3
3
  setup(
4
4
  name="OneStop4All-Indexer",
5
- version="2.8.0.dev9",
5
+ version="2.8.0.dev11",
6
6
  description="Library to harvest data from NFDI4Earth-KnowledgeHub to OneStop4All-Index",
7
7
  author="Markus Konkol, Arne Vogt, Tom Niers, Ralf Klammer",
8
8
  author_email="m.konkol@52north.org, a.vogt@52north.org, tom.niers@tu-dresden.de, ralf.klammer@tu-dresden.de",
@@ -16,7 +16,10 @@ setup(
16
16
  "shapely ~= 2.0.5",
17
17
  ],
18
18
  extras_require={
19
- "airflow": ["apache-airflow==3.1.7"],
19
+ "airflow": [
20
+ "apache-airflow==3.1.7",
21
+ "apache-airflow-providers-amazon[s3fs]",
22
+ ],
20
23
  },
21
24
  include_package_data=True,
22
25
  entry_points={
@@ -1,8 +1,9 @@
1
1
  import logging
2
2
 
3
- from pysolr import Solr as SolrBase
3
+ from pysolr import Solr as SolrClient
4
4
  from requests.auth import HTTPBasicAuth
5
5
  from typing import List, Dict, Optional, Literal
6
+ from urllib.parse import urljoin
6
7
 
7
8
  from utils import config
8
9
 
@@ -21,7 +22,7 @@ class Solr(object):
21
22
  self.solr_url = solr_url
22
23
  self.solr_core = solr_core
23
24
  self.auth = solr_auth
24
- self.solr = SolrBase(
25
+ self.client = SolrClient(
25
26
  self.endpoint,
26
27
  auth=self.authentication,
27
28
  always_commit=always_commit,
@@ -34,8 +35,17 @@ class Solr(object):
34
35
  # overwrite with initially given values
35
36
  # TODO: check if endpoint is reachable, if not raise error
36
37
  solr_url = self.solr_url if self.solr_url else config["solr_url"]
38
+ log.debug(f"configured solr url: {solr_url}")
39
+ if solr_url.startswith("http://"):
40
+ raise ValueError(
41
+ "Insecure solr url configured. "
42
+ "Please check your configuration and use https."
43
+ )
37
44
  solr_core = self.solr_core if self.solr_core else config["solr_core"]
38
- return f"{solr_url.rstrip('/')}/{solr_core}"
45
+ log.debug(f"configured solr core: {solr_core}")
46
+ _endpoint = urljoin(solr_url, solr_core)
47
+ log.info(f"initialized solr client with endpoint: {_endpoint}")
48
+ return _endpoint
39
49
 
40
50
  @property
41
51
  def authentication(self):
@@ -47,12 +57,11 @@ class Solr(object):
47
57
  )
48
58
  return HTTPBasicAuth(username, password)
49
59
 
50
- def index_documents(self, documents: List[Dict]) -> None:
60
+ def index_documents(
61
+ self, documents: List[Dict], commit=True, ping=True
62
+ ) -> None:
51
63
  # solr_endpoint = coreurl(solr_url, solr_core)
52
- log.info(
53
- f"start indexing {len(documents)} documents to {self.endpoint}"
54
- )
55
-
64
+ log.info(f"start indexing {len(documents)} documents")
56
65
  batch_size = 50000
57
66
  offset = 0
58
67
  iteration = 0
@@ -65,11 +74,11 @@ class Solr(object):
65
74
  if len(batch) == 0:
66
75
  break
67
76
 
68
- self.solr.ping()
77
+ self.client.ping()
69
78
  log.info("solr healtcheck successful")
70
79
  try:
71
80
  log.info(f"start adding batch of {len(batch)} documents")
72
- self.solr.add(batch)
81
+ self.client.add(batch)
73
82
  log.info(f"finished adding batch of {len(batch)} documents")
74
83
  except Exception as e:
75
84
  log.error(e)
@@ -80,17 +89,33 @@ class Solr(object):
80
89
  if len(batch) < batch_size:
81
90
  break
82
91
 
83
- log.info("commit changes to index")
84
- self.solr.ping()
85
- log.info("solr healtcheck successful")
86
- self.solr.commit()
87
- log.info("sucessfully commited changes to index")
92
+ if ping is True:
93
+ self.client.ping()
94
+ log.info("solr healtcheck successful")
95
+
96
+ if commit is True:
97
+ log.info("commit changes to index")
98
+ self.client.commit()
99
+ log.info("sucessfully commited changes to index")
88
100
  log.info("finished indexing")
89
101
 
102
+ def commit(self):
103
+ self.client.commit()
104
+
90
105
  def reset_index(self):
91
- log.info(f"reset index for: {self.endpoint}")
92
- self.solr.delete(q="*:*")
93
- self.solr.commit()
106
+ self.client.delete(q="*:*")
107
+ self.client.commit()
108
+
109
+ def exists(self, document_id: str) -> bool:
110
+ result = self.search(f"id:{document_id}")
111
+ return result.hits > 0
112
+
113
+ def search(self, *args, **kwargs):
114
+ return self.client.search(*args, **kwargs)
115
+
116
+ def delete(self, *args, **kwargs):
117
+ self.client.delete(*args, **kwargs)
118
+ self.client.commit()
94
119
 
95
120
 
96
121
  class SolrValidator(Solr):
@@ -105,7 +130,7 @@ class SolrValidator(Solr):
105
130
  }
106
131
 
107
132
  try:
108
- self.solr.add(dataset)
133
+ self.client.add(dataset)
109
134
  return True
110
135
  except Exception as e:
111
136
  log.error(e)
@@ -114,5 +139,5 @@ class SolrValidator(Solr):
114
139
  def close(self):
115
140
  # technically not necessary to check existence of test document,
116
141
  # but when urls are misconfigured, this will throw an error
117
- if self.solr.search("id:geomValidationTest").hits:
118
- self.solr.delete(q="id:geomValidationTest")
142
+ if self.exists("geomValidationTest"):
143
+ self.delete(id="geomValidationTest")