OneStop4All-Indexer 2.8.0.dev10__tar.gz → 2.8.0.dev12__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {onestop4all_indexer-2.8.0.dev10 → onestop4all_indexer-2.8.0.dev12/OneStop4All_Indexer.egg-info}/PKG-INFO +2 -1
- {onestop4all_indexer-2.8.0.dev10 → onestop4all_indexer-2.8.0.dev12}/OneStop4All_Indexer.egg-info/requires.txt +1 -0
- {onestop4all_indexer-2.8.0.dev10/OneStop4All_Indexer.egg-info → onestop4all_indexer-2.8.0.dev12}/PKG-INFO +2 -1
- {onestop4all_indexer-2.8.0.dev10 → onestop4all_indexer-2.8.0.dev12}/harvesters/harvester_dataset.py +0 -1
- {onestop4all_indexer-2.8.0.dev10 → onestop4all_indexer-2.8.0.dev12}/harvesters/harvester_organization.py +28 -20
- {onestop4all_indexer-2.8.0.dev10 → onestop4all_indexer-2.8.0.dev12}/setup.py +5 -2
- {onestop4all_indexer-2.8.0.dev10 → onestop4all_indexer-2.8.0.dev12}/utils/solr.py +14 -6
- {onestop4all_indexer-2.8.0.dev10 → onestop4all_indexer-2.8.0.dev12}/LICENSE +0 -0
- {onestop4all_indexer-2.8.0.dev10 → onestop4all_indexer-2.8.0.dev12}/OneStop4All_Indexer.egg-info/SOURCES.txt +0 -0
- {onestop4all_indexer-2.8.0.dev10 → onestop4all_indexer-2.8.0.dev12}/OneStop4All_Indexer.egg-info/dependency_links.txt +0 -0
- {onestop4all_indexer-2.8.0.dev10 → onestop4all_indexer-2.8.0.dev12}/OneStop4All_Indexer.egg-info/entry_points.txt +0 -0
- {onestop4all_indexer-2.8.0.dev10 → onestop4all_indexer-2.8.0.dev12}/OneStop4All_Indexer.egg-info/top_level.txt +0 -0
- {onestop4all_indexer-2.8.0.dev10 → onestop4all_indexer-2.8.0.dev12}/data_repositories/__init__.py +0 -0
- {onestop4all_indexer-2.8.0.dev10 → onestop4all_indexer-2.8.0.dev12}/data_repositories/repository_base.py +0 -0
- {onestop4all_indexer-2.8.0.dev10 → onestop4all_indexer-2.8.0.dev12}/data_repositories/repository_n4eorganization.py +0 -0
- {onestop4all_indexer-2.8.0.dev10 → onestop4all_indexer-2.8.0.dev12}/data_repositories/repository_person.py +0 -0
- {onestop4all_indexer-2.8.0.dev10 → onestop4all_indexer-2.8.0.dev12}/data_repositories/repository_resource_links.py +0 -0
- {onestop4all_indexer-2.8.0.dev10 → onestop4all_indexer-2.8.0.dev12}/data_repositories/repository_theme.py +0 -0
- {onestop4all_indexer-2.8.0.dev10 → onestop4all_indexer-2.8.0.dev12}/harvesters/__init__.py +0 -0
- {onestop4all_indexer-2.8.0.dev10 → onestop4all_indexer-2.8.0.dev12}/harvesters/harvester_article.py +0 -0
- {onestop4all_indexer-2.8.0.dev10 → onestop4all_indexer-2.8.0.dev12}/harvesters/harvester_base.py +0 -0
- {onestop4all_indexer-2.8.0.dev10 → onestop4all_indexer-2.8.0.dev12}/harvesters/harvester_dataservice.py +0 -0
- {onestop4all_indexer-2.8.0.dev10 → onestop4all_indexer-2.8.0.dev12}/harvesters/harvester_document.py +0 -0
- {onestop4all_indexer-2.8.0.dev10 → onestop4all_indexer-2.8.0.dev12}/harvesters/harvester_learningresource.py +0 -0
- {onestop4all_indexer-2.8.0.dev10 → onestop4all_indexer-2.8.0.dev12}/harvesters/harvester_metadatastandards.py +0 -0
- {onestop4all_indexer-2.8.0.dev10 → onestop4all_indexer-2.8.0.dev12}/harvesters/harvester_repository.py +0 -0
- {onestop4all_indexer-2.8.0.dev10 → onestop4all_indexer-2.8.0.dev12}/harvesters/harvester_service.py +0 -0
- {onestop4all_indexer-2.8.0.dev10 → onestop4all_indexer-2.8.0.dev12}/harvesters/harvester_softwaresourcecode.py +0 -0
- {onestop4all_indexer-2.8.0.dev10 → onestop4all_indexer-2.8.0.dev12}/pyproject.toml +0 -0
- {onestop4all_indexer-2.8.0.dev10 → onestop4all_indexer-2.8.0.dev12}/setup.cfg +0 -0
- {onestop4all_indexer-2.8.0.dev10 → onestop4all_indexer-2.8.0.dev12}/utils/__init__.py +0 -0
- {onestop4all_indexer-2.8.0.dev10 → onestop4all_indexer-2.8.0.dev12}/utils/cli.py +0 -0
- {onestop4all_indexer-2.8.0.dev10 → onestop4all_indexer-2.8.0.dev12}/utils/configs.py +0 -0
- {onestop4all_indexer-2.8.0.dev10 → onestop4all_indexer-2.8.0.dev12}/utils/harvest.py +0 -0
- {onestop4all_indexer-2.8.0.dev10 → onestop4all_indexer-2.8.0.dev12}/utils/sparql.py +0 -0
- {onestop4all_indexer-2.8.0.dev10 → onestop4all_indexer-2.8.0.dev12}/utils/util.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: OneStop4All-Indexer
|
|
3
|
-
Version: 2.8.0.dev10
|
|
3
|
+
Version: 2.8.0.dev12
|
|
4
4
|
Summary: Library to harvest data from NFDI4Earth-KnowledgeHub to OneStop4All-Index
|
|
5
5
|
Author: Markus Konkol, Arne Vogt, Tom Niers, Ralf Klammer
|
|
6
6
|
Author-email: m.konkol@52north.org, a.vogt@52north.org, tom.niers@tu-dresden.de, ralf.klammer@tu-dresden.de
|
|
@@ -13,6 +13,7 @@ Requires-Dist: geomet~=1.1.0
|
|
|
13
13
|
Requires-Dist: shapely~=2.0.5
|
|
14
14
|
Provides-Extra: airflow
|
|
15
15
|
Requires-Dist: apache-airflow==3.1.7; extra == "airflow"
|
|
16
|
+
Requires-Dist: apache-airflow-providers-amazon[s3fs]; extra == "airflow"
|
|
16
17
|
Dynamic: author
|
|
17
18
|
Dynamic: author-email
|
|
18
19
|
Dynamic: license-file
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: OneStop4All-Indexer
|
|
3
|
-
Version: 2.8.0.dev10
|
|
3
|
+
Version: 2.8.0.dev12
|
|
4
4
|
Summary: Library to harvest data from NFDI4Earth-KnowledgeHub to OneStop4All-Index
|
|
5
5
|
Author: Markus Konkol, Arne Vogt, Tom Niers, Ralf Klammer
|
|
6
6
|
Author-email: m.konkol@52north.org, a.vogt@52north.org, tom.niers@tu-dresden.de, ralf.klammer@tu-dresden.de
|
|
@@ -13,6 +13,7 @@ Requires-Dist: geomet~=1.1.0
|
|
|
13
13
|
Requires-Dist: shapely~=2.0.5
|
|
14
14
|
Provides-Extra: airflow
|
|
15
15
|
Requires-Dist: apache-airflow==3.1.7; extra == "airflow"
|
|
16
|
+
Requires-Dist: apache-airflow-providers-amazon[s3fs]; extra == "airflow"
|
|
16
17
|
Dynamic: author
|
|
17
18
|
Dynamic: author-email
|
|
18
19
|
Dynamic: license-file
|
{onestop4all_indexer-2.8.0.dev10 → onestop4all_indexer-2.8.0.dev12}/harvesters/harvester_dataset.py
RENAMED
|
@@ -299,7 +299,6 @@ class Dataset_Harvester(HarvesterCordra):
|
|
|
299
299
|
value=val,
|
|
300
300
|
)
|
|
301
301
|
if "downloadURL" in distribution:
|
|
302
|
-
print(distribution["downloadURL"])
|
|
303
302
|
for download_url in distribution["downloadURL"]:
|
|
304
303
|
val = self.get_string_from_jsonld(
|
|
305
304
|
download_url, subject
|
|
@@ -1,10 +1,10 @@
|
|
|
1
|
-
from .harvester_base import Harvester
|
|
2
|
-
from data_repositories.repository_person import RepositoryPerson
|
|
3
|
-
from utils import sparql
|
|
4
|
-
from utils import flatten_dict
|
|
5
|
-
from utils import is_truthy
|
|
6
1
|
import logging
|
|
7
2
|
|
|
3
|
+
from data_repositories.repository_person import RepositoryPerson
|
|
4
|
+
from utils import sparql, flatten_dict, is_truthy
|
|
5
|
+
|
|
6
|
+
from .harvester_base import Harvester
|
|
7
|
+
|
|
8
8
|
log = logging.getLogger(__name__)
|
|
9
9
|
|
|
10
10
|
|
|
@@ -20,32 +20,37 @@ class Organization_Harvester(Harvester):
|
|
|
20
20
|
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
|
|
21
21
|
PREFIX org: <http://www.w3.org/ns/org#>
|
|
22
22
|
PREFIX schema: <http://schema.org/>
|
|
23
|
+
PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>
|
|
23
24
|
|
|
24
25
|
SELECT ?subject ?predicate ?object ?geo_as_wkt ?isN4EMember
|
|
25
26
|
WHERE {
|
|
26
|
-
|
|
27
|
+
{
|
|
28
|
+
SELECT DISTINCT ?subject
|
|
29
|
+
WHERE {
|
|
30
|
+
?subject rdf:type foaf:Organization .
|
|
31
|
+
BIND(STRAFTER(STR(?subject), "/objects/") AS ?id)
|
|
32
|
+
FILTER(STRSTARTS(?id, "n4e/"))
|
|
33
|
+
FILTER(!CONTAINS(?id, "jsonPointer"))
|
|
34
|
+
FILTER EXISTS { ?subject n4e:sourceSystem ?source }
|
|
35
|
+
}
|
|
36
|
+
OFFSET %s
|
|
37
|
+
LIMIT %s
|
|
38
|
+
}
|
|
39
|
+
|
|
27
40
|
?subject ?predicate ?object .
|
|
28
|
-
|
|
29
|
-
FILTER EXISTS { ?subject n4e:sourceSystem ?source }
|
|
41
|
+
|
|
30
42
|
OPTIONAL {
|
|
31
43
|
?subject geo:hasGeometry ?geometry .
|
|
32
44
|
?geometry geo:asWKT ?geo_as_wkt .
|
|
33
45
|
}
|
|
46
|
+
|
|
34
47
|
OPTIONAL {
|
|
35
|
-
?subject
|
|
36
|
-
OPTIONAL { ?source_system dct:title ?sourceSystem_title . }
|
|
37
|
-
OPTIONAL { ?source_system foaf:homepage ?sourceSystem_homepage . }
|
|
38
|
-
}
|
|
39
|
-
OPTIONAL {
|
|
40
|
-
?subject rdf:type foaf:Organization ;
|
|
41
|
-
org:hasMembership ?membership .
|
|
48
|
+
?subject org:hasMembership ?membership .
|
|
42
49
|
?membership org:organization ?n4eproject .
|
|
43
50
|
?n4eproject schema:url "https://nfdi4earth.de/"^^xsd:anyURI .
|
|
44
51
|
BIND(true AS ?isN4EMember)
|
|
45
52
|
}
|
|
46
53
|
}
|
|
47
|
-
OFFSET %d
|
|
48
|
-
LIMIT %d
|
|
49
54
|
"""
|
|
50
55
|
|
|
51
56
|
def __init__(self, persons_repo: RepositoryPerson, **kw):
|
|
@@ -150,13 +155,16 @@ class Organization_Harvester(Harvester):
|
|
|
150
155
|
# set geometry if available and not already set
|
|
151
156
|
if (
|
|
152
157
|
predicate == "http://www.opengis.net/ont/geosparql#hasGeometry"
|
|
153
|
-
and (
|
|
154
|
-
|
|
158
|
+
and (
|
|
159
|
+
"geometry" not in organizations[subject]
|
|
160
|
+
or hit["geo_as_wkt"]["value"]
|
|
161
|
+
not in organizations[subject]["geometry"]
|
|
162
|
+
)
|
|
155
163
|
):
|
|
156
164
|
self.addValue(
|
|
157
165
|
dict=organizations[subject],
|
|
158
166
|
attribute="geometry",
|
|
159
|
-
value=hit["geo_as_wkt"]["value"]
|
|
167
|
+
value=hit["geo_as_wkt"]["value"],
|
|
160
168
|
)
|
|
161
169
|
|
|
162
170
|
# set membership in N4E project
|
|
@@ -2,7 +2,7 @@ from setuptools import find_packages, setup
|
|
|
2
2
|
|
|
3
3
|
setup(
|
|
4
4
|
name="OneStop4All-Indexer",
|
|
5
|
-
version="2.8.0.dev10",
|
|
5
|
+
version="2.8.0.dev12",
|
|
6
6
|
description="Library to harvest data from NFDI4Earth-KnowledgeHub to OneStop4All-Index",
|
|
7
7
|
author="Markus Konkol, Arne Vogt, Tom Niers, Ralf Klammer",
|
|
8
8
|
author_email="m.konkol@52north.org, a.vogt@52north.org, tom.niers@tu-dresden.de, ralf.klammer@tu-dresden.de",
|
|
@@ -16,7 +16,10 @@ setup(
|
|
|
16
16
|
"shapely ~= 2.0.5",
|
|
17
17
|
],
|
|
18
18
|
extras_require={
|
|
19
|
-
"airflow": ["apache-airflow==3.1.7"],
|
|
19
|
+
"airflow": [
|
|
20
|
+
"apache-airflow==3.1.7",
|
|
21
|
+
"apache-airflow-providers-amazon[s3fs]",
|
|
22
|
+
],
|
|
20
23
|
},
|
|
21
24
|
include_package_data=True,
|
|
22
25
|
entry_points={
|
|
@@ -57,7 +57,9 @@ class Solr(object):
|
|
|
57
57
|
)
|
|
58
58
|
return HTTPBasicAuth(username, password)
|
|
59
59
|
|
|
60
|
-
def index_documents(
|
|
60
|
+
def index_documents(
|
|
61
|
+
self, documents: List[Dict], commit=True, ping=True
|
|
62
|
+
) -> None:
|
|
61
63
|
# solr_endpoint = coreurl(solr_url, solr_core)
|
|
62
64
|
log.info(f"start indexing {len(documents)} documents")
|
|
63
65
|
batch_size = 50000
|
|
@@ -87,13 +89,19 @@ class Solr(object):
|
|
|
87
89
|
if len(batch) < batch_size:
|
|
88
90
|
break
|
|
89
91
|
|
|
90
|
-
|
|
91
|
-
|
|
92
|
-
|
|
93
|
-
|
|
94
|
-
|
|
92
|
+
if ping is True:
|
|
93
|
+
self.client.ping()
|
|
94
|
+
log.info("solr healtcheck successful")
|
|
95
|
+
|
|
96
|
+
if commit is True:
|
|
97
|
+
log.info("commit changes to index")
|
|
98
|
+
self.client.commit()
|
|
99
|
+
log.info("sucessfully commited changes to index")
|
|
95
100
|
log.info("finished indexing")
|
|
96
101
|
|
|
102
|
+
def commit(self):
|
|
103
|
+
self.client.commit()
|
|
104
|
+
|
|
97
105
|
def reset_index(self):
|
|
98
106
|
self.client.delete(q="*:*")
|
|
99
107
|
self.client.commit()
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{onestop4all_indexer-2.8.0.dev10 → onestop4all_indexer-2.8.0.dev12}/data_repositories/__init__.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{onestop4all_indexer-2.8.0.dev10 → onestop4all_indexer-2.8.0.dev12}/harvesters/harvester_article.py
RENAMED
|
File without changes
|
{onestop4all_indexer-2.8.0.dev10 → onestop4all_indexer-2.8.0.dev12}/harvesters/harvester_base.py
RENAMED
|
File without changes
|
|
File without changes
|
{onestop4all_indexer-2.8.0.dev10 → onestop4all_indexer-2.8.0.dev12}/harvesters/harvester_document.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{onestop4all_indexer-2.8.0.dev10 → onestop4all_indexer-2.8.0.dev12}/harvesters/harvester_service.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|