OneStop4All-Indexer 2.8.0.dev3__tar.gz → 2.8.0.dev5__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {onestop4all_indexer-2.8.0.dev3 → onestop4all_indexer-2.8.0.dev5/OneStop4All_Indexer.egg-info}/PKG-INFO +5 -2
- {onestop4all_indexer-2.8.0.dev3 → onestop4all_indexer-2.8.0.dev5}/OneStop4All_Indexer.egg-info/SOURCES.txt +2 -0
- {onestop4all_indexer-2.8.0.dev3 → onestop4all_indexer-2.8.0.dev5}/OneStop4All_Indexer.egg-info/requires.txt +4 -1
- {onestop4all_indexer-2.8.0.dev3/OneStop4All_Indexer.egg-info → onestop4all_indexer-2.8.0.dev5}/PKG-INFO +5 -2
- {onestop4all_indexer-2.8.0.dev3 → onestop4all_indexer-2.8.0.dev5}/harvesters/harvester_dataservice.py +19 -0
- {onestop4all_indexer-2.8.0.dev3 → onestop4all_indexer-2.8.0.dev5}/harvesters/harvester_dataset.py +0 -1
- {onestop4all_indexer-2.8.0.dev3 → onestop4all_indexer-2.8.0.dev5}/setup.py +5 -2
- {onestop4all_indexer-2.8.0.dev3 → onestop4all_indexer-2.8.0.dev5}/utils/configs.py +16 -0
- onestop4all_indexer-2.8.0.dev5/utils/embeddings.py +67 -0
- onestop4all_indexer-2.8.0.dev5/utils/harvest.py +201 -0
- onestop4all_indexer-2.8.0.dev5/utils/qdrant.py +97 -0
- onestop4all_indexer-2.8.0.dev3/utils/harvest.py +0 -123
- {onestop4all_indexer-2.8.0.dev3 → onestop4all_indexer-2.8.0.dev5}/LICENSE +0 -0
- {onestop4all_indexer-2.8.0.dev3 → onestop4all_indexer-2.8.0.dev5}/OneStop4All_Indexer.egg-info/dependency_links.txt +0 -0
- {onestop4all_indexer-2.8.0.dev3 → onestop4all_indexer-2.8.0.dev5}/OneStop4All_Indexer.egg-info/entry_points.txt +0 -0
- {onestop4all_indexer-2.8.0.dev3 → onestop4all_indexer-2.8.0.dev5}/OneStop4All_Indexer.egg-info/top_level.txt +0 -0
- {onestop4all_indexer-2.8.0.dev3 → onestop4all_indexer-2.8.0.dev5}/data_repositories/__init__.py +0 -0
- {onestop4all_indexer-2.8.0.dev3 → onestop4all_indexer-2.8.0.dev5}/data_repositories/repository_base.py +0 -0
- {onestop4all_indexer-2.8.0.dev3 → onestop4all_indexer-2.8.0.dev5}/data_repositories/repository_n4eorganization.py +0 -0
- {onestop4all_indexer-2.8.0.dev3 → onestop4all_indexer-2.8.0.dev5}/data_repositories/repository_person.py +0 -0
- {onestop4all_indexer-2.8.0.dev3 → onestop4all_indexer-2.8.0.dev5}/data_repositories/repository_resource_links.py +0 -0
- {onestop4all_indexer-2.8.0.dev3 → onestop4all_indexer-2.8.0.dev5}/data_repositories/repository_theme.py +0 -0
- {onestop4all_indexer-2.8.0.dev3 → onestop4all_indexer-2.8.0.dev5}/harvesters/__init__.py +0 -0
- {onestop4all_indexer-2.8.0.dev3 → onestop4all_indexer-2.8.0.dev5}/harvesters/harvester_article.py +0 -0
- {onestop4all_indexer-2.8.0.dev3 → onestop4all_indexer-2.8.0.dev5}/harvesters/harvester_base.py +0 -0
- {onestop4all_indexer-2.8.0.dev3 → onestop4all_indexer-2.8.0.dev5}/harvesters/harvester_document.py +0 -0
- {onestop4all_indexer-2.8.0.dev3 → onestop4all_indexer-2.8.0.dev5}/harvesters/harvester_learningresource.py +0 -0
- {onestop4all_indexer-2.8.0.dev3 → onestop4all_indexer-2.8.0.dev5}/harvesters/harvester_metadatastandards.py +0 -0
- {onestop4all_indexer-2.8.0.dev3 → onestop4all_indexer-2.8.0.dev5}/harvesters/harvester_organization.py +0 -0
- {onestop4all_indexer-2.8.0.dev3 → onestop4all_indexer-2.8.0.dev5}/harvesters/harvester_repository.py +0 -0
- {onestop4all_indexer-2.8.0.dev3 → onestop4all_indexer-2.8.0.dev5}/harvesters/harvester_service.py +0 -0
- {onestop4all_indexer-2.8.0.dev3 → onestop4all_indexer-2.8.0.dev5}/harvesters/harvester_softwaresourcecode.py +0 -0
- {onestop4all_indexer-2.8.0.dev3 → onestop4all_indexer-2.8.0.dev5}/pyproject.toml +0 -0
- {onestop4all_indexer-2.8.0.dev3 → onestop4all_indexer-2.8.0.dev5}/setup.cfg +0 -0
- {onestop4all_indexer-2.8.0.dev3 → onestop4all_indexer-2.8.0.dev5}/utils/__init__.py +0 -0
- {onestop4all_indexer-2.8.0.dev3 → onestop4all_indexer-2.8.0.dev5}/utils/cli.py +0 -0
- {onestop4all_indexer-2.8.0.dev3 → onestop4all_indexer-2.8.0.dev5}/utils/solr.py +0 -0
- {onestop4all_indexer-2.8.0.dev3 → onestop4all_indexer-2.8.0.dev5}/utils/sparql.py +0 -0
- {onestop4all_indexer-2.8.0.dev3 → onestop4all_indexer-2.8.0.dev5}/utils/util.py +0 -0
|
@@ -1,16 +1,19 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: OneStop4All-Indexer
|
|
3
|
-
Version: 2.8.0.dev3
|
|
3
|
+
Version: 2.8.0.dev5
|
|
4
4
|
Summary: Library to harvest data from NFDI4Earth-KnowledgeHub to OneStop4All-Index
|
|
5
5
|
Author: Markus Konkol, Arne Vogt, Tom Niers, Ralf Klammer
|
|
6
6
|
Author-email: m.konkol@52north.org, a.vogt@52north.org, tom.niers@tu-dresden.de, ralf.klammer@tu-dresden.de
|
|
7
7
|
License-File: LICENSE
|
|
8
8
|
Requires-Dist: click
|
|
9
9
|
Requires-Dist: sparqlwrapper~=2.0.0
|
|
10
|
-
Requires-Dist: pysolr
|
|
10
|
+
Requires-Dist: pysolr>=3.11.0
|
|
11
11
|
Requires-Dist: jproperties~=2.1.1
|
|
12
12
|
Requires-Dist: geomet~=1.1.0
|
|
13
13
|
Requires-Dist: shapely~=2.0.5
|
|
14
|
+
Requires-Dist: sentence-transformers~=5.1.0
|
|
15
|
+
Requires-Dist: qdrant-client~=1.15.1
|
|
16
|
+
Requires-Dist: xformers
|
|
14
17
|
Dynamic: author
|
|
15
18
|
Dynamic: author-email
|
|
16
19
|
Dynamic: license-file
|
|
@@ -1,16 +1,19 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: OneStop4All-Indexer
|
|
3
|
-
Version: 2.8.0.dev3
|
|
3
|
+
Version: 2.8.0.dev5
|
|
4
4
|
Summary: Library to harvest data from NFDI4Earth-KnowledgeHub to OneStop4All-Index
|
|
5
5
|
Author: Markus Konkol, Arne Vogt, Tom Niers, Ralf Klammer
|
|
6
6
|
Author-email: m.konkol@52north.org, a.vogt@52north.org, tom.niers@tu-dresden.de, ralf.klammer@tu-dresden.de
|
|
7
7
|
License-File: LICENSE
|
|
8
8
|
Requires-Dist: click
|
|
9
9
|
Requires-Dist: sparqlwrapper~=2.0.0
|
|
10
|
-
Requires-Dist: pysolr
|
|
10
|
+
Requires-Dist: pysolr>=3.11.0
|
|
11
11
|
Requires-Dist: jproperties~=2.1.1
|
|
12
12
|
Requires-Dist: geomet~=1.1.0
|
|
13
13
|
Requires-Dist: shapely~=2.0.5
|
|
14
|
+
Requires-Dist: sentence-transformers~=5.1.0
|
|
15
|
+
Requires-Dist: qdrant-client~=1.15.1
|
|
16
|
+
Requires-Dist: xformers
|
|
14
17
|
Dynamic: author
|
|
15
18
|
Dynamic: author-email
|
|
16
19
|
Dynamic: license-file
|
|
@@ -1,4 +1,5 @@
|
|
|
1
1
|
import logging
|
|
2
|
+
import json
|
|
2
3
|
from .harvester_base import Harvester
|
|
3
4
|
from data_repositories.repository_theme import RepositoryTheme
|
|
4
5
|
from data_repositories import RepositoryPerson
|
|
@@ -68,6 +69,10 @@ class DataService_Harvester(Harvester):
|
|
|
68
69
|
iteration_start=0,
|
|
69
70
|
iteration_end=None,
|
|
70
71
|
page_size=10000,
|
|
72
|
+
load_from_file="/tmp/dataservices.json",
|
|
73
|
+
# load_from_file=None,
|
|
74
|
+
# save_to_file="/tmp/dataservices.json",
|
|
75
|
+
save_to_file=None,
|
|
71
76
|
**kw,
|
|
72
77
|
):
|
|
73
78
|
super().__init__(**kw)
|
|
@@ -76,6 +81,8 @@ class DataService_Harvester(Harvester):
|
|
|
76
81
|
self.solr_validator = SolrValidator()
|
|
77
82
|
self.solr_validation = solr_validation
|
|
78
83
|
self.removed_geometries = []
|
|
84
|
+
self.load_from_file = load_from_file
|
|
85
|
+
self.save_to_file = save_to_file
|
|
79
86
|
self.iteration_start = (
|
|
80
87
|
int(iteration_start) if iteration_start is not None else 0
|
|
81
88
|
)
|
|
@@ -92,6 +99,13 @@ iter_end={self.iteration_end}"""
|
|
|
92
99
|
)
|
|
93
100
|
|
|
94
101
|
def harvest(self):
|
|
102
|
+
|
|
103
|
+
if self.load_from_file is not None:
|
|
104
|
+
log.info(f"Loading DataServices from file {self.load_from_file}")
|
|
105
|
+
with open(self.load_from_file, "r", encoding="utf-8") as f:
|
|
106
|
+
services_list = json.load(f)
|
|
107
|
+
return services_list
|
|
108
|
+
|
|
95
109
|
limit = self.page_size
|
|
96
110
|
# convert to list of repo documents for indexing
|
|
97
111
|
services = {} # repos dict
|
|
@@ -129,6 +143,11 @@ iter_end={self.iteration_end}"""
|
|
|
129
143
|
service["mainTitle"] = service["mainTitle"].strip()
|
|
130
144
|
services_list.append(service)
|
|
131
145
|
|
|
146
|
+
if self.save_to_file is not None:
|
|
147
|
+
log.info(f"Saving DataServices to file {self.save_to_file}")
|
|
148
|
+
with open(self.save_to_file, "w", encoding="utf-8") as f:
|
|
149
|
+
json.dump(services_list, f, ensure_ascii=False, indent=2)
|
|
150
|
+
|
|
132
151
|
return services_list
|
|
133
152
|
|
|
134
153
|
def parse_response(self, hits, services):
|
{onestop4all_indexer-2.8.0.dev3 → onestop4all_indexer-2.8.0.dev5}/harvesters/harvester_dataset.py
RENAMED
|
@@ -302,7 +302,6 @@ class Dataset_Harvester(HarvesterCordra):
|
|
|
302
302
|
value=val,
|
|
303
303
|
)
|
|
304
304
|
if "downloadURL" in distribution:
|
|
305
|
-
print(distribution["downloadURL"])
|
|
306
305
|
for download_url in distribution["downloadURL"]:
|
|
307
306
|
val = self.get_string_from_jsonld(
|
|
308
307
|
download_url, subject
|
|
@@ -2,7 +2,7 @@ from setuptools import find_packages, setup
|
|
|
2
2
|
|
|
3
3
|
setup(
|
|
4
4
|
name="OneStop4All-Indexer",
|
|
5
|
-
version="2.8.0.dev3",
|
|
5
|
+
version="2.8.0.dev5",
|
|
6
6
|
description="Library to harvest data from NFDI4Earth-KnowledgeHub to OneStop4All-Index",
|
|
7
7
|
author="Markus Konkol, Arne Vogt, Tom Niers, Ralf Klammer",
|
|
8
8
|
author_email="m.konkol@52north.org, a.vogt@52north.org, tom.niers@tu-dresden.de, ralf.klammer@tu-dresden.de",
|
|
@@ -10,10 +10,13 @@ setup(
|
|
|
10
10
|
install_requires=[
|
|
11
11
|
"click",
|
|
12
12
|
"sparqlwrapper~= 2.0.0",
|
|
13
|
-
"pysolr",
|
|
13
|
+
"pysolr>= 3.11.0",
|
|
14
14
|
"jproperties~= 2.1.1 ",
|
|
15
15
|
"geomet ~= 1.1.0",
|
|
16
16
|
"shapely ~= 2.0.5",
|
|
17
|
+
"sentence-transformers ~= 5.1.0",
|
|
18
|
+
"qdrant-client ~= 1.15.1",
|
|
19
|
+
"xformers",
|
|
17
20
|
],
|
|
18
21
|
include_package_data=True,
|
|
19
22
|
entry_points={
|
|
@@ -88,4 +88,20 @@ config = {
|
|
|
88
88
|
),
|
|
89
89
|
)
|
|
90
90
|
),
|
|
91
|
+
"qdrant_url": os.getenv(
|
|
92
|
+
"QDRANT_URL",
|
|
93
|
+
default=(
|
|
94
|
+
app_properties.get("qdrant_url").data
|
|
95
|
+
if app_properties.get("qdrant_url")
|
|
96
|
+
else None
|
|
97
|
+
),
|
|
98
|
+
),
|
|
99
|
+
"qdrant_api_key": os.getenv(
|
|
100
|
+
"QDRANT_API_KEY",
|
|
101
|
+
default=(
|
|
102
|
+
app_properties.get("qdrant_api_key").data
|
|
103
|
+
if app_properties.get("qdrant_api_key")
|
|
104
|
+
else None
|
|
105
|
+
),
|
|
106
|
+
),
|
|
91
107
|
}
|
|
@@ -0,0 +1,67 @@
|
|
|
1
|
+
from sentence_transformers import SentenceTransformer
|
|
2
|
+
import logging
|
|
3
|
+
|
|
4
|
+
log = logging.getLogger(__name__)
|
|
5
|
+
|
|
6
|
+
model_384 = SentenceTransformer(
|
|
7
|
+
"sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"
|
|
8
|
+
)
|
|
9
|
+
# model_768 = SentenceTransformer("sentence-transformers/paraphrase-multilingual-mpnet-base-v2")
|
|
10
|
+
# model_1024 = SentenceTransformer("Qwen/Qwen3-Embedding-0.6B")
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
def add_embedding(document):
|
|
14
|
+
if document["type"] == "person_nested" or "person_nested" in ",".join(
|
|
15
|
+
document
|
|
16
|
+
): # no embeddings for nested author documents
|
|
17
|
+
return
|
|
18
|
+
|
|
19
|
+
try:
|
|
20
|
+
emb_str = get_entity_card(document)
|
|
21
|
+
except Exception as e:
|
|
22
|
+
log.error(e)
|
|
23
|
+
log.error(document)
|
|
24
|
+
log.info(emb_str if emb_str.isprintable() else document["id"])
|
|
25
|
+
try:
|
|
26
|
+
if emb_str is not None:
|
|
27
|
+
document["embedding_384"] = model_384.encode(emb_str).tolist()
|
|
28
|
+
# document["embedding_768"] = model_768.encode(emb_str).tolist()
|
|
29
|
+
# document["embedding_1024"] = model_1024.encode(emb_str).tolist()
|
|
30
|
+
except Exception as e:
|
|
31
|
+
log.error(e)
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
def get_entity_card(document):
|
|
35
|
+
text = document["mainTitle"]
|
|
36
|
+
if "description" in document and len(document["description"]) > 0:
|
|
37
|
+
text += "\n" + "\n".join(document["description"])
|
|
38
|
+
if "keyword" in document and len(document["keyword"]) > 0:
|
|
39
|
+
text += "\n" + ",".join(document["keyword"])
|
|
40
|
+
elif "keywords" in document and len(document["keywords"]) > 0:
|
|
41
|
+
text += "\n" + ",".join(document["keywords"])
|
|
42
|
+
|
|
43
|
+
if (
|
|
44
|
+
"http://xmlns.com/foaf/0.1/Organization" in document["type"]
|
|
45
|
+
): # special attributes for organizations
|
|
46
|
+
if "altLabel" in document:
|
|
47
|
+
text += "\n" + ",".join(document["altLabel"])
|
|
48
|
+
if "countryName" in document:
|
|
49
|
+
text += "\n" + "country: " + ",".join(document["countryName"])
|
|
50
|
+
if "locality" in document:
|
|
51
|
+
text += "\n" + "location: " + ",".join(document["locality"])
|
|
52
|
+
|
|
53
|
+
return text
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
def get_type_str(
|
|
57
|
+
document,
|
|
58
|
+
): # might need improvement with proper mapping (aligned with OS4A)
|
|
59
|
+
type_uri = document["type"]
|
|
60
|
+
if isinstance(type_uri, str):
|
|
61
|
+
type_val = type_uri.strip().lstrip("/").split("/")[-1]
|
|
62
|
+
elif isinstance(type_uri, list):
|
|
63
|
+
uris = []
|
|
64
|
+
for uri in type_uri:
|
|
65
|
+
uris.append(uri.strip().lstrip("/").split("/")[-1])
|
|
66
|
+
type_val = ",".join(uris)
|
|
67
|
+
return type_val
|
|
@@ -0,0 +1,201 @@
|
|
|
1
|
+
import logging
|
|
2
|
+
import requests
|
|
3
|
+
import timeit
|
|
4
|
+
import traceback
|
|
5
|
+
|
|
6
|
+
from socket import gethostname
|
|
7
|
+
from typing import Optional
|
|
8
|
+
|
|
9
|
+
from data_repositories import (
|
|
10
|
+
RepositoryTheme,
|
|
11
|
+
RepositoryPerson,
|
|
12
|
+
RepositoryResourceLinks,
|
|
13
|
+
RepositoryN4EOrganization,
|
|
14
|
+
)
|
|
15
|
+
from harvesters import (
|
|
16
|
+
Repository_Harvester,
|
|
17
|
+
Organization_Harvester,
|
|
18
|
+
Article_Harvester,
|
|
19
|
+
Softwaresourcecode_Harvester,
|
|
20
|
+
Learningresource_Harvester,
|
|
21
|
+
Metadatastandard_Harvester,
|
|
22
|
+
Document_Harvester,
|
|
23
|
+
Dataset_Harvester,
|
|
24
|
+
DataService_Harvester,
|
|
25
|
+
Service_Harvester,
|
|
26
|
+
)
|
|
27
|
+
from utils import config, Solr
|
|
28
|
+
from utils import embeddings
|
|
29
|
+
from utils.qdrant import Qdrant
|
|
30
|
+
|
|
31
|
+
log = logging.getLogger(__name__)
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
def get_harvester(
|
|
35
|
+
harvester_name=None,
|
|
36
|
+
dataset_options={},
|
|
37
|
+
dataservice_options={},
|
|
38
|
+
):
|
|
39
|
+
|
|
40
|
+
# Lazy initialization of repository singletons
|
|
41
|
+
# Only one instance per repository type will be created and reused
|
|
42
|
+
repo_singletons = {}
|
|
43
|
+
repo_classes = {
|
|
44
|
+
"themes_repo": RepositoryTheme,
|
|
45
|
+
"persons_repo": RepositoryPerson,
|
|
46
|
+
"links_repo": RepositoryResourceLinks,
|
|
47
|
+
"n4e_orgas_repo": RepositoryN4EOrganization,
|
|
48
|
+
}
|
|
49
|
+
|
|
50
|
+
def get_repo(repo_name):
|
|
51
|
+
# Returns a singleton instance of the requested repository.
|
|
52
|
+
# Instantiates the repository only on first access.
|
|
53
|
+
if repo_name not in repo_singletons:
|
|
54
|
+
repo_singletons[repo_name] = repo_classes[repo_name]()
|
|
55
|
+
return repo_singletons[repo_name]
|
|
56
|
+
|
|
57
|
+
# Mapping from harvester name to factory function
|
|
58
|
+
# Each factory uses get_repo to ensure repositories are only created once
|
|
59
|
+
harvester_factories = {
|
|
60
|
+
"Service": lambda: Service_Harvester(get_repo("n4e_orgas_repo")),
|
|
61
|
+
"DataService": lambda: DataService_Harvester(
|
|
62
|
+
get_repo("themes_repo"),
|
|
63
|
+
get_repo("persons_repo"),
|
|
64
|
+
**dataservice_options,
|
|
65
|
+
),
|
|
66
|
+
"Repository": lambda: Repository_Harvester(
|
|
67
|
+
get_repo("themes_repo"),
|
|
68
|
+
get_repo("n4e_orgas_repo"),
|
|
69
|
+
),
|
|
70
|
+
"Organization": lambda: Organization_Harvester(
|
|
71
|
+
get_repo("persons_repo")
|
|
72
|
+
),
|
|
73
|
+
"Article": lambda: Article_Harvester(
|
|
74
|
+
get_repo("persons_repo"),
|
|
75
|
+
get_repo("links_repo"),
|
|
76
|
+
get_repo("themes_repo"),
|
|
77
|
+
),
|
|
78
|
+
"Softwaresourcecode": lambda: Softwaresourcecode_Harvester(
|
|
79
|
+
get_repo("persons_repo"),
|
|
80
|
+
get_repo("links_repo"),
|
|
81
|
+
),
|
|
82
|
+
"Learningresource": lambda: Learningresource_Harvester(
|
|
83
|
+
get_repo("persons_repo"),
|
|
84
|
+
get_repo("themes_repo"),
|
|
85
|
+
),
|
|
86
|
+
"Metadatastandard": lambda: Metadatastandard_Harvester(
|
|
87
|
+
get_repo("themes_repo")
|
|
88
|
+
),
|
|
89
|
+
"Document": lambda: Document_Harvester(
|
|
90
|
+
get_repo("persons_repo"),
|
|
91
|
+
get_repo("links_repo"),
|
|
92
|
+
),
|
|
93
|
+
"Dataset": lambda: Dataset_Harvester(
|
|
94
|
+
get_repo("persons_repo"),
|
|
95
|
+
get_repo("links_repo"),
|
|
96
|
+
**dataset_options,
|
|
97
|
+
),
|
|
98
|
+
}
|
|
99
|
+
|
|
100
|
+
# If no harvester_name is given, instantiate all harvesters
|
|
101
|
+
if not harvester_name:
|
|
102
|
+
return [factory() for factory in harvester_factories.values()]
|
|
103
|
+
|
|
104
|
+
# If a name or list/tuple of names is given, instantiate only those
|
|
105
|
+
# This keeps compatibility with previous behavior
|
|
106
|
+
if isinstance(harvester_name, (list, tuple)):
|
|
107
|
+
names = harvester_name
|
|
108
|
+
else:
|
|
109
|
+
names = [harvester_name]
|
|
110
|
+
|
|
111
|
+
return [
|
|
112
|
+
harvester_factories[name]()
|
|
113
|
+
for name in harvester_factories
|
|
114
|
+
if name in names
|
|
115
|
+
]
|
|
116
|
+
|
|
117
|
+
|
|
118
|
+
def run(
|
|
119
|
+
requested_harvester: Optional[tuple] = None,
|
|
120
|
+
reset_index: Optional[bool] = None,
|
|
121
|
+
**further_options,
|
|
122
|
+
):
|
|
123
|
+
solr = Solr()
|
|
124
|
+
qdrant = Qdrant() # vector db
|
|
125
|
+
|
|
126
|
+
if reset_index:
|
|
127
|
+
solr.reset_index()
|
|
128
|
+
|
|
129
|
+
stats = {}
|
|
130
|
+
|
|
131
|
+
i = 1
|
|
132
|
+
harvesters = get_harvester(
|
|
133
|
+
harvester_name=requested_harvester, **further_options
|
|
134
|
+
)
|
|
135
|
+
log.info(f"Initialized {len(harvesters)} harvesters.")
|
|
136
|
+
document_count = 0
|
|
137
|
+
try:
|
|
138
|
+
for harvester in harvesters:
|
|
139
|
+
log.info(f"start harvester {harvester} ({i} of {len(harvesters)}")
|
|
140
|
+
start_time = timeit.default_timer()
|
|
141
|
+
documents = harvester.harvest()
|
|
142
|
+
elapsed_time = timeit.default_timer() - start_time
|
|
143
|
+
|
|
144
|
+
try:
|
|
145
|
+
generate_embeddings(documents)
|
|
146
|
+
except Exception as e:
|
|
147
|
+
log.error(e)
|
|
148
|
+
|
|
149
|
+
log.info(f"index harvested {len(documents)} documents (vector db)")
|
|
150
|
+
qdrant.index_documents(documents)
|
|
151
|
+
log.info(
|
|
152
|
+
f"finished indexing for harvester {i} of {len(harvesters)}) (vector db)"
|
|
153
|
+
)
|
|
154
|
+
|
|
155
|
+
# remove embeddings from document, otherwise embeddings would be indexed in solr
|
|
156
|
+
# comment loop if embeddings should be indexed in solr as well.
|
|
157
|
+
for document in documents:
|
|
158
|
+
document.pop("embedding_384", None)
|
|
159
|
+
|
|
160
|
+
log.info(f"index harvested {len(documents)} documents")
|
|
161
|
+
solr.index_documents(documents)
|
|
162
|
+
document_count += len(documents)
|
|
163
|
+
log.info(
|
|
164
|
+
f"finished indexing for harvester {i} of {len(harvesters)})"
|
|
165
|
+
)
|
|
166
|
+
|
|
167
|
+
stats[harvester.get_type()] = {
|
|
168
|
+
"document_count": len(documents),
|
|
169
|
+
"processing_time": round(elapsed_time, 4),
|
|
170
|
+
}
|
|
171
|
+
if harvester.get_notes():
|
|
172
|
+
stats[harvester.get_type()]["notes"] = harvester.get_notes()
|
|
173
|
+
i += 1
|
|
174
|
+
except Exception as e:
|
|
175
|
+
data = {
|
|
176
|
+
"exception": str(e),
|
|
177
|
+
"traceback": traceback.format_exc(),
|
|
178
|
+
"hostname": gethostname(),
|
|
179
|
+
}
|
|
180
|
+
if config["mailserver_url"]:
|
|
181
|
+
requests.post(config["mailserver_url"], json=data, verify=False)
|
|
182
|
+
else:
|
|
183
|
+
log.info(
|
|
184
|
+
"harvesting completed, indexed {} documents".format(document_count)
|
|
185
|
+
)
|
|
186
|
+
requests.post(
|
|
187
|
+
config["mailserver_url"],
|
|
188
|
+
json={"stats": stats, "hostname": gethostname()},
|
|
189
|
+
verify=False,
|
|
190
|
+
)
|
|
191
|
+
|
|
192
|
+
|
|
193
|
+
def generate_embeddings(documents):
|
|
194
|
+
log.info("create embeddings for " + str(len(documents)) + " documents")
|
|
195
|
+
for document in documents:
|
|
196
|
+
embeddings.add_embedding(document)
|
|
197
|
+
log.info("created embeddings for batch")
|
|
198
|
+
|
|
199
|
+
|
|
200
|
+
if __name__ == "__main__":
|
|
201
|
+
run()
|
|
@@ -0,0 +1,97 @@
|
|
|
1
|
+
import datetime
|
|
2
|
+
import logging
|
|
3
|
+
import uuid
|
|
4
|
+
|
|
5
|
+
from qdrant_client import QdrantClient, models
|
|
6
|
+
from typing import List, Dict
|
|
7
|
+
|
|
8
|
+
from utils import config
|
|
9
|
+
|
|
10
|
+
log = logging.getLogger(__name__)
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
class Qdrant:
|
|
14
|
+
|
|
15
|
+
def __init__(self):
|
|
16
|
+
self.endpoint = config["qdrant_url"]
|
|
17
|
+
self.client = QdrantClient(self.endpoint, api_key=config["qdrant_url"])
|
|
18
|
+
log.debug(f"Initialized Qdrant client with endpoint {self.endpoint}")
|
|
19
|
+
self.collections = [
|
|
20
|
+
{"dim": 384, "name": "embedding_384"}
|
|
21
|
+
# {"dim": 768, "name": "embedding_768"},
|
|
22
|
+
# {"dim": 1024, "name": "embedding_1024"}
|
|
23
|
+
]
|
|
24
|
+
|
|
25
|
+
self.init_collections()
|
|
26
|
+
|
|
27
|
+
def init_collections(self):
|
|
28
|
+
for collection in self.collections:
|
|
29
|
+
self.init_collection(collection["dim"], collection["name"])
|
|
30
|
+
|
|
31
|
+
def init_collection(self, vector_dim, collection_name):
|
|
32
|
+
collections = self.client.get_collections() # existing collections
|
|
33
|
+
collection_names = [col.name for col in collections.collections]
|
|
34
|
+
|
|
35
|
+
try:
|
|
36
|
+
if (
|
|
37
|
+
collection_name not in collection_names
|
|
38
|
+
): # only create if not existing
|
|
39
|
+
self.client.create_collection(
|
|
40
|
+
collection_name=collection_name,
|
|
41
|
+
vectors_config=models.VectorParams(
|
|
42
|
+
size=vector_dim, distance=models.Distance.COSINE
|
|
43
|
+
),
|
|
44
|
+
)
|
|
45
|
+
self.client.create_payload_index(
|
|
46
|
+
collection_name=collection_name,
|
|
47
|
+
field_name="id",
|
|
48
|
+
field_schema="keyword",
|
|
49
|
+
)
|
|
50
|
+
except Exception as e:
|
|
51
|
+
log.error(e)
|
|
52
|
+
|
|
53
|
+
def index_documents(self, documents):
|
|
54
|
+
for collection in self.collections:
|
|
55
|
+
self.index_documents_dim(
|
|
56
|
+
documents,
|
|
57
|
+
collection_name=collection["name"],
|
|
58
|
+
embedding_key=collection["name"],
|
|
59
|
+
)
|
|
60
|
+
|
|
61
|
+
def index_documents_dim(
|
|
62
|
+
self, documents: List[Dict], collection_name, embedding_key
|
|
63
|
+
) -> None:
|
|
64
|
+
log.info(
|
|
65
|
+
f"start indexing {len(documents)} documents to {self.endpoint}"
|
|
66
|
+
)
|
|
67
|
+
|
|
68
|
+
embeddings = []
|
|
69
|
+
payloads = []
|
|
70
|
+
ids = []
|
|
71
|
+
|
|
72
|
+
try:
|
|
73
|
+
for document in documents:
|
|
74
|
+
ids.append(
|
|
75
|
+
str(uuid.uuid5(uuid.NAMESPACE_DNS, document["id"]))
|
|
76
|
+
) # generates always the same uuid for the same document id
|
|
77
|
+
embeddings.append(document[str(embedding_key)])
|
|
78
|
+
payload = {
|
|
79
|
+
"id": document["id"],
|
|
80
|
+
"mainTitle": document["mainTitle"],
|
|
81
|
+
"type": document["type"],
|
|
82
|
+
"indexedAt": datetime.datetime.now(datetime.timezone.utc),
|
|
83
|
+
}
|
|
84
|
+
if "description" in document:
|
|
85
|
+
payload["description"] = document["description"]
|
|
86
|
+
if "keywords" in document:
|
|
87
|
+
payload["keyword"] = document["keyword"]
|
|
88
|
+
payloads.append(payload)
|
|
89
|
+
|
|
90
|
+
self.client.upload_collection(
|
|
91
|
+
collection_name=collection_name,
|
|
92
|
+
vectors=embeddings,
|
|
93
|
+
ids=ids,
|
|
94
|
+
payload=payloads,
|
|
95
|
+
)
|
|
96
|
+
except Exception as e:
|
|
97
|
+
log.error(e)
|
|
@@ -1,123 +0,0 @@
|
|
|
1
|
-
import logging
|
|
2
|
-
import requests
|
|
3
|
-
import timeit
|
|
4
|
-
import traceback
|
|
5
|
-
|
|
6
|
-
from socket import gethostname
|
|
7
|
-
from typing import Optional
|
|
8
|
-
|
|
9
|
-
from data_repositories import (
|
|
10
|
-
RepositoryTheme,
|
|
11
|
-
RepositoryPerson,
|
|
12
|
-
RepositoryResourceLinks,
|
|
13
|
-
RepositoryN4EOrganization,
|
|
14
|
-
)
|
|
15
|
-
from harvesters import (
|
|
16
|
-
Repository_Harvester,
|
|
17
|
-
Organization_Harvester,
|
|
18
|
-
Article_Harvester,
|
|
19
|
-
Softwaresourcecode_Harvester,
|
|
20
|
-
Learningresource_Harvester,
|
|
21
|
-
Metadatastandard_Harvester,
|
|
22
|
-
Document_Harvester,
|
|
23
|
-
Dataset_Harvester,
|
|
24
|
-
DataService_Harvester,
|
|
25
|
-
Service_Harvester,
|
|
26
|
-
)
|
|
27
|
-
from utils import config, Solr
|
|
28
|
-
|
|
29
|
-
log = logging.getLogger(__name__)
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
def run(
|
|
33
|
-
requested_harvester: Optional[tuple] = None,
|
|
34
|
-
reset_index: Optional[bool] = None,
|
|
35
|
-
dataset_options: Optional[dict] = {},
|
|
36
|
-
dataservice_options: Optional[dict] = {},
|
|
37
|
-
):
|
|
38
|
-
# init in memory data repos
|
|
39
|
-
themes_repo = RepositoryTheme()
|
|
40
|
-
persons_repo = RepositoryPerson()
|
|
41
|
-
links_repo = RepositoryResourceLinks()
|
|
42
|
-
n4e_orgas_repo = RepositoryN4EOrganization()
|
|
43
|
-
|
|
44
|
-
solr = Solr()
|
|
45
|
-
|
|
46
|
-
if reset_index:
|
|
47
|
-
solr.reset_index()
|
|
48
|
-
|
|
49
|
-
stats = {}
|
|
50
|
-
|
|
51
|
-
_harvesters = {
|
|
52
|
-
"Service": Service_Harvester(n4e_organizations_repo=n4e_orgas_repo),
|
|
53
|
-
"DataService": DataService_Harvester(
|
|
54
|
-
themes_repo, persons_repo, **dataservice_options
|
|
55
|
-
),
|
|
56
|
-
"Repository": Repository_Harvester(
|
|
57
|
-
themes_repo, n4e_organizations_repo=n4e_orgas_repo
|
|
58
|
-
),
|
|
59
|
-
"Organization": Organization_Harvester(persons_repo),
|
|
60
|
-
"Article": Article_Harvester(persons_repo, links_repo, themes_repo),
|
|
61
|
-
"Softwaresourcecode": Softwaresourcecode_Harvester(
|
|
62
|
-
persons_repo, links_repo
|
|
63
|
-
),
|
|
64
|
-
"Learningresource": Learningresource_Harvester(
|
|
65
|
-
persons_repo, themes_repo
|
|
66
|
-
),
|
|
67
|
-
"Metadatastandard": Metadatastandard_Harvester(themes_repo),
|
|
68
|
-
"Document": Document_Harvester(persons_repo, links_repo),
|
|
69
|
-
"Dataset": Dataset_Harvester(
|
|
70
|
-
persons_repo, links_repo, **dataset_options
|
|
71
|
-
),
|
|
72
|
-
}
|
|
73
|
-
i = 1
|
|
74
|
-
document_count = 0
|
|
75
|
-
try:
|
|
76
|
-
for harvester_name in _harvesters:
|
|
77
|
-
if (
|
|
78
|
-
requested_harvester
|
|
79
|
-
and harvester_name not in requested_harvester
|
|
80
|
-
):
|
|
81
|
-
continue
|
|
82
|
-
log.info(
|
|
83
|
-
f"start harvester {harvester_name} ({i} of {len(_harvesters)}"
|
|
84
|
-
)
|
|
85
|
-
harvester = _harvesters[harvester_name]
|
|
86
|
-
start_time = timeit.default_timer()
|
|
87
|
-
documents = harvester.harvest()
|
|
88
|
-
elapsed_time = timeit.default_timer() - start_time
|
|
89
|
-
|
|
90
|
-
log.info(f"index harvested {len(documents)} documents")
|
|
91
|
-
solr.index_documents(documents)
|
|
92
|
-
document_count += len(documents)
|
|
93
|
-
log.info(
|
|
94
|
-
f"finished indexing for harvester {i} of {len(_harvesters)})"
|
|
95
|
-
)
|
|
96
|
-
stats[harvester.get_type()] = {
|
|
97
|
-
"document_count": len(documents),
|
|
98
|
-
"processing_time": round(elapsed_time, 4),
|
|
99
|
-
}
|
|
100
|
-
if harvester.get_notes():
|
|
101
|
-
stats[harvester.get_type()]["notes"] = harvester.get_notes()
|
|
102
|
-
i += 1
|
|
103
|
-
except Exception as e:
|
|
104
|
-
data = {
|
|
105
|
-
"exception": str(e),
|
|
106
|
-
"traceback": traceback.format_exc(),
|
|
107
|
-
"hostname": gethostname(),
|
|
108
|
-
}
|
|
109
|
-
if config["mailserver_url"]:
|
|
110
|
-
requests.post(config["mailserver_url"], json=data, verify=False)
|
|
111
|
-
else:
|
|
112
|
-
log.info(
|
|
113
|
-
"harvesting completed, indexed {} documents".format(document_count)
|
|
114
|
-
)
|
|
115
|
-
requests.post(
|
|
116
|
-
config["mailserver_url"],
|
|
117
|
-
json={"stats": stats, "hostname": gethostname()},
|
|
118
|
-
verify=False,
|
|
119
|
-
)
|
|
120
|
-
|
|
121
|
-
|
|
122
|
-
if __name__ == "__main__":
|
|
123
|
-
run()
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{onestop4all_indexer-2.8.0.dev3 → onestop4all_indexer-2.8.0.dev5}/data_repositories/__init__.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{onestop4all_indexer-2.8.0.dev3 → onestop4all_indexer-2.8.0.dev5}/harvesters/harvester_article.py
RENAMED
|
File without changes
|
{onestop4all_indexer-2.8.0.dev3 → onestop4all_indexer-2.8.0.dev5}/harvesters/harvester_base.py
RENAMED
|
File without changes
|
{onestop4all_indexer-2.8.0.dev3 → onestop4all_indexer-2.8.0.dev5}/harvesters/harvester_document.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{onestop4all_indexer-2.8.0.dev3 → onestop4all_indexer-2.8.0.dev5}/harvesters/harvester_repository.py
RENAMED
|
File without changes
|
{onestop4all_indexer-2.8.0.dev3 → onestop4all_indexer-2.8.0.dev5}/harvesters/harvester_service.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|