OneStop4All-Indexer 2.8.0.dev9__tar.gz → 2.8.0.dev11__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {onestop4all_indexer-2.8.0.dev9 → onestop4all_indexer-2.8.0.dev11/OneStop4All_Indexer.egg-info}/PKG-INFO +2 -1
- {onestop4all_indexer-2.8.0.dev9 → onestop4all_indexer-2.8.0.dev11}/OneStop4All_Indexer.egg-info/requires.txt +1 -0
- {onestop4all_indexer-2.8.0.dev9/OneStop4All_Indexer.egg-info → onestop4all_indexer-2.8.0.dev11}/PKG-INFO +2 -1
- {onestop4all_indexer-2.8.0.dev9 → onestop4all_indexer-2.8.0.dev11}/harvesters/harvester_dataset.py +0 -1
- {onestop4all_indexer-2.8.0.dev9 → onestop4all_indexer-2.8.0.dev11}/setup.py +5 -2
- {onestop4all_indexer-2.8.0.dev9 → onestop4all_indexer-2.8.0.dev11}/utils/solr.py +46 -21
- {onestop4all_indexer-2.8.0.dev9 → onestop4all_indexer-2.8.0.dev11}/LICENSE +0 -0
- {onestop4all_indexer-2.8.0.dev9 → onestop4all_indexer-2.8.0.dev11}/OneStop4All_Indexer.egg-info/SOURCES.txt +0 -0
- {onestop4all_indexer-2.8.0.dev9 → onestop4all_indexer-2.8.0.dev11}/OneStop4All_Indexer.egg-info/dependency_links.txt +0 -0
- {onestop4all_indexer-2.8.0.dev9 → onestop4all_indexer-2.8.0.dev11}/OneStop4All_Indexer.egg-info/entry_points.txt +0 -0
- {onestop4all_indexer-2.8.0.dev9 → onestop4all_indexer-2.8.0.dev11}/OneStop4All_Indexer.egg-info/top_level.txt +0 -0
- {onestop4all_indexer-2.8.0.dev9 → onestop4all_indexer-2.8.0.dev11}/data_repositories/__init__.py +0 -0
- {onestop4all_indexer-2.8.0.dev9 → onestop4all_indexer-2.8.0.dev11}/data_repositories/repository_base.py +0 -0
- {onestop4all_indexer-2.8.0.dev9 → onestop4all_indexer-2.8.0.dev11}/data_repositories/repository_n4eorganization.py +0 -0
- {onestop4all_indexer-2.8.0.dev9 → onestop4all_indexer-2.8.0.dev11}/data_repositories/repository_person.py +0 -0
- {onestop4all_indexer-2.8.0.dev9 → onestop4all_indexer-2.8.0.dev11}/data_repositories/repository_resource_links.py +0 -0
- {onestop4all_indexer-2.8.0.dev9 → onestop4all_indexer-2.8.0.dev11}/data_repositories/repository_theme.py +0 -0
- {onestop4all_indexer-2.8.0.dev9 → onestop4all_indexer-2.8.0.dev11}/harvesters/__init__.py +0 -0
- {onestop4all_indexer-2.8.0.dev9 → onestop4all_indexer-2.8.0.dev11}/harvesters/harvester_article.py +0 -0
- {onestop4all_indexer-2.8.0.dev9 → onestop4all_indexer-2.8.0.dev11}/harvesters/harvester_base.py +0 -0
- {onestop4all_indexer-2.8.0.dev9 → onestop4all_indexer-2.8.0.dev11}/harvesters/harvester_dataservice.py +0 -0
- {onestop4all_indexer-2.8.0.dev9 → onestop4all_indexer-2.8.0.dev11}/harvesters/harvester_document.py +0 -0
- {onestop4all_indexer-2.8.0.dev9 → onestop4all_indexer-2.8.0.dev11}/harvesters/harvester_learningresource.py +0 -0
- {onestop4all_indexer-2.8.0.dev9 → onestop4all_indexer-2.8.0.dev11}/harvesters/harvester_metadatastandards.py +0 -0
- {onestop4all_indexer-2.8.0.dev9 → onestop4all_indexer-2.8.0.dev11}/harvesters/harvester_organization.py +0 -0
- {onestop4all_indexer-2.8.0.dev9 → onestop4all_indexer-2.8.0.dev11}/harvesters/harvester_repository.py +0 -0
- {onestop4all_indexer-2.8.0.dev9 → onestop4all_indexer-2.8.0.dev11}/harvesters/harvester_service.py +0 -0
- {onestop4all_indexer-2.8.0.dev9 → onestop4all_indexer-2.8.0.dev11}/harvesters/harvester_softwaresourcecode.py +0 -0
- {onestop4all_indexer-2.8.0.dev9 → onestop4all_indexer-2.8.0.dev11}/pyproject.toml +0 -0
- {onestop4all_indexer-2.8.0.dev9 → onestop4all_indexer-2.8.0.dev11}/setup.cfg +0 -0
- {onestop4all_indexer-2.8.0.dev9 → onestop4all_indexer-2.8.0.dev11}/utils/__init__.py +0 -0
- {onestop4all_indexer-2.8.0.dev9 → onestop4all_indexer-2.8.0.dev11}/utils/cli.py +0 -0
- {onestop4all_indexer-2.8.0.dev9 → onestop4all_indexer-2.8.0.dev11}/utils/configs.py +0 -0
- {onestop4all_indexer-2.8.0.dev9 → onestop4all_indexer-2.8.0.dev11}/utils/harvest.py +0 -0
- {onestop4all_indexer-2.8.0.dev9 → onestop4all_indexer-2.8.0.dev11}/utils/sparql.py +0 -0
- {onestop4all_indexer-2.8.0.dev9 → onestop4all_indexer-2.8.0.dev11}/utils/util.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: OneStop4All-Indexer
|
|
3
|
-
Version: 2.8.0.dev9
|
|
3
|
+
Version: 2.8.0.dev11
|
|
4
4
|
Summary: Library to harvest data from NFDI4Earth-KnowledgeHub to OneStop4All-Index
|
|
5
5
|
Author: Markus Konkol, Arne Vogt, Tom Niers, Ralf Klammer
|
|
6
6
|
Author-email: m.konkol@52north.org, a.vogt@52north.org, tom.niers@tu-dresden.de, ralf.klammer@tu-dresden.de
|
|
@@ -13,6 +13,7 @@ Requires-Dist: geomet~=1.1.0
|
|
|
13
13
|
Requires-Dist: shapely~=2.0.5
|
|
14
14
|
Provides-Extra: airflow
|
|
15
15
|
Requires-Dist: apache-airflow==3.1.7; extra == "airflow"
|
|
16
|
+
Requires-Dist: apache-airflow-providers-amazon[s3fs]; extra == "airflow"
|
|
16
17
|
Dynamic: author
|
|
17
18
|
Dynamic: author-email
|
|
18
19
|
Dynamic: license-file
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: OneStop4All-Indexer
|
|
3
|
-
Version: 2.8.0.dev9
|
|
3
|
+
Version: 2.8.0.dev11
|
|
4
4
|
Summary: Library to harvest data from NFDI4Earth-KnowledgeHub to OneStop4All-Index
|
|
5
5
|
Author: Markus Konkol, Arne Vogt, Tom Niers, Ralf Klammer
|
|
6
6
|
Author-email: m.konkol@52north.org, a.vogt@52north.org, tom.niers@tu-dresden.de, ralf.klammer@tu-dresden.de
|
|
@@ -13,6 +13,7 @@ Requires-Dist: geomet~=1.1.0
|
|
|
13
13
|
Requires-Dist: shapely~=2.0.5
|
|
14
14
|
Provides-Extra: airflow
|
|
15
15
|
Requires-Dist: apache-airflow==3.1.7; extra == "airflow"
|
|
16
|
+
Requires-Dist: apache-airflow-providers-amazon[s3fs]; extra == "airflow"
|
|
16
17
|
Dynamic: author
|
|
17
18
|
Dynamic: author-email
|
|
18
19
|
Dynamic: license-file
|
{onestop4all_indexer-2.8.0.dev9 → onestop4all_indexer-2.8.0.dev11}/harvesters/harvester_dataset.py
RENAMED
|
@@ -299,7 +299,6 @@ class Dataset_Harvester(HarvesterCordra):
|
|
|
299
299
|
value=val,
|
|
300
300
|
)
|
|
301
301
|
if "downloadURL" in distribution:
|
|
302
|
-
print(distribution["downloadURL"])
|
|
303
302
|
for download_url in distribution["downloadURL"]:
|
|
304
303
|
val = self.get_string_from_jsonld(
|
|
305
304
|
download_url, subject
|
|
@@ -2,7 +2,7 @@ from setuptools import find_packages, setup
|
|
|
2
2
|
|
|
3
3
|
setup(
|
|
4
4
|
name="OneStop4All-Indexer",
|
|
5
|
-
version="2.8.0.dev9",
|
|
5
|
+
version="2.8.0.dev11",
|
|
6
6
|
description="Library to harvest data from NFDI4Earth-KnowledgeHub to OneStop4All-Index",
|
|
7
7
|
author="Markus Konkol, Arne Vogt, Tom Niers, Ralf Klammer",
|
|
8
8
|
author_email="m.konkol@52north.org, a.vogt@52north.org, tom.niers@tu-dresden.de, ralf.klammer@tu-dresden.de",
|
|
@@ -16,7 +16,10 @@ setup(
|
|
|
16
16
|
"shapely ~= 2.0.5",
|
|
17
17
|
],
|
|
18
18
|
extras_require={
|
|
19
|
-
"airflow": ["apache-airflow==3.1.7"],
|
|
19
|
+
"airflow": [
|
|
20
|
+
"apache-airflow==3.1.7",
|
|
21
|
+
"apache-airflow-providers-amazon[s3fs]",
|
|
22
|
+
],
|
|
20
23
|
},
|
|
21
24
|
include_package_data=True,
|
|
22
25
|
entry_points={
|
|
@@ -1,8 +1,9 @@
|
|
|
1
1
|
import logging
|
|
2
2
|
|
|
3
|
-
from pysolr import Solr as
|
|
3
|
+
from pysolr import Solr as SolrClient
|
|
4
4
|
from requests.auth import HTTPBasicAuth
|
|
5
5
|
from typing import List, Dict, Optional, Literal
|
|
6
|
+
from urllib.parse import urljoin
|
|
6
7
|
|
|
7
8
|
from utils import config
|
|
8
9
|
|
|
@@ -21,7 +22,7 @@ class Solr(object):
|
|
|
21
22
|
self.solr_url = solr_url
|
|
22
23
|
self.solr_core = solr_core
|
|
23
24
|
self.auth = solr_auth
|
|
24
|
-
self.
|
|
25
|
+
self.client = SolrClient(
|
|
25
26
|
self.endpoint,
|
|
26
27
|
auth=self.authentication,
|
|
27
28
|
always_commit=always_commit,
|
|
@@ -34,8 +35,17 @@ class Solr(object):
|
|
|
34
35
|
# overwrite with initially given values
|
|
35
36
|
# TODO: check if endpoint is reachable, if not raise error
|
|
36
37
|
solr_url = self.solr_url if self.solr_url else config["solr_url"]
|
|
38
|
+
log.debug(f"configured solr url: {solr_url}")
|
|
39
|
+
if solr_url.startswith("http://"):
|
|
40
|
+
raise ValueError(
|
|
41
|
+
"Insecure solr url configured. "
|
|
42
|
+
"Please check your configuration and use https."
|
|
43
|
+
)
|
|
37
44
|
solr_core = self.solr_core if self.solr_core else config["solr_core"]
|
|
38
|
-
|
|
45
|
+
log.debug(f"configured solr core: {solr_core}")
|
|
46
|
+
_endpoint = urljoin(solr_url, solr_core)
|
|
47
|
+
log.info(f"initialized solr client with endpoint: {_endpoint}")
|
|
48
|
+
return _endpoint
|
|
39
49
|
|
|
40
50
|
@property
|
|
41
51
|
def authentication(self):
|
|
@@ -47,12 +57,11 @@ class Solr(object):
|
|
|
47
57
|
)
|
|
48
58
|
return HTTPBasicAuth(username, password)
|
|
49
59
|
|
|
50
|
-
def index_documents(
|
|
60
|
+
def index_documents(
|
|
61
|
+
self, documents: List[Dict], commit=True, ping=True
|
|
62
|
+
) -> None:
|
|
51
63
|
# solr_endpoint = coreurl(solr_url, solr_core)
|
|
52
|
-
log.info(
|
|
53
|
-
f"start indexing {len(documents)} documents to {self.endpoint}"
|
|
54
|
-
)
|
|
55
|
-
|
|
64
|
+
log.info(f"start indexing {len(documents)} documents")
|
|
56
65
|
batch_size = 50000
|
|
57
66
|
offset = 0
|
|
58
67
|
iteration = 0
|
|
@@ -65,11 +74,11 @@ class Solr(object):
|
|
|
65
74
|
if len(batch) == 0:
|
|
66
75
|
break
|
|
67
76
|
|
|
68
|
-
self.
|
|
77
|
+
self.client.ping()
|
|
69
78
|
log.info("solr healtcheck successful")
|
|
70
79
|
try:
|
|
71
80
|
log.info(f"start adding batch of {len(batch)} documents")
|
|
72
|
-
self.
|
|
81
|
+
self.client.add(batch)
|
|
73
82
|
log.info(f"finished adding batch of {len(batch)} documents")
|
|
74
83
|
except Exception as e:
|
|
75
84
|
log.error(e)
|
|
@@ -80,17 +89,33 @@ class Solr(object):
|
|
|
80
89
|
if len(batch) < batch_size:
|
|
81
90
|
break
|
|
82
91
|
|
|
83
|
-
|
|
84
|
-
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
|
|
92
|
+
if ping is True:
|
|
93
|
+
self.client.ping()
|
|
94
|
+
log.info("solr healtcheck successful")
|
|
95
|
+
|
|
96
|
+
if commit is True:
|
|
97
|
+
log.info("commit changes to index")
|
|
98
|
+
self.client.commit()
|
|
99
|
+
log.info("sucessfully commited changes to index")
|
|
88
100
|
log.info("finished indexing")
|
|
89
101
|
|
|
102
|
+
def commit(self):
|
|
103
|
+
self.client.commit()
|
|
104
|
+
|
|
90
105
|
def reset_index(self):
|
|
91
|
-
|
|
92
|
-
self.
|
|
93
|
-
|
|
106
|
+
self.client.delete(q="*:*")
|
|
107
|
+
self.client.commit()
|
|
108
|
+
|
|
109
|
+
def exists(self, document_id: str) -> bool:
|
|
110
|
+
result = self.search(f"id:{document_id}")
|
|
111
|
+
return result.hits > 0
|
|
112
|
+
|
|
113
|
+
def search(self, *args, **kwargs):
|
|
114
|
+
return self.client.search(*args, **kwargs)
|
|
115
|
+
|
|
116
|
+
def delete(self, *args, **kwargs):
|
|
117
|
+
self.client.delete(*args, **kwargs)
|
|
118
|
+
self.client.commit()
|
|
94
119
|
|
|
95
120
|
|
|
96
121
|
class SolrValidator(Solr):
|
|
@@ -105,7 +130,7 @@ class SolrValidator(Solr):
|
|
|
105
130
|
}
|
|
106
131
|
|
|
107
132
|
try:
|
|
108
|
-
self.
|
|
133
|
+
self.client.add(dataset)
|
|
109
134
|
return True
|
|
110
135
|
except Exception as e:
|
|
111
136
|
log.error(e)
|
|
@@ -114,5 +139,5 @@ class SolrValidator(Solr):
|
|
|
114
139
|
def close(self):
|
|
115
140
|
# technically not necessary to check existence of test document,
|
|
116
141
|
# but when urls are misconfigured, this will throw an error
|
|
117
|
-
if self.
|
|
118
|
-
self.
|
|
142
|
+
if self.exists("geomValidationTest"):
|
|
143
|
+
self.delete(id="geomValidationTest")
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{onestop4all_indexer-2.8.0.dev9 → onestop4all_indexer-2.8.0.dev11}/data_repositories/__init__.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{onestop4all_indexer-2.8.0.dev9 → onestop4all_indexer-2.8.0.dev11}/harvesters/harvester_article.py
RENAMED
|
File without changes
|
{onestop4all_indexer-2.8.0.dev9 → onestop4all_indexer-2.8.0.dev11}/harvesters/harvester_base.py
RENAMED
|
File without changes
|
|
File without changes
|
{onestop4all_indexer-2.8.0.dev9 → onestop4all_indexer-2.8.0.dev11}/harvesters/harvester_document.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{onestop4all_indexer-2.8.0.dev9 → onestop4all_indexer-2.8.0.dev11}/harvesters/harvester_service.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|