OneStop4All-Indexer 2.8.0.dev6__tar.gz → 2.8.0.dev8__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {onestop4all_indexer-2.8.0.dev6 → onestop4all_indexer-2.8.0.dev8/OneStop4All_Indexer.egg-info}/PKG-INFO +2 -5
- {onestop4all_indexer-2.8.0.dev6 → onestop4all_indexer-2.8.0.dev8}/OneStop4All_Indexer.egg-info/SOURCES.txt +0 -2
- {onestop4all_indexer-2.8.0.dev6 → onestop4all_indexer-2.8.0.dev8}/OneStop4All_Indexer.egg-info/requires.txt +1 -4
- {onestop4all_indexer-2.8.0.dev6/OneStop4All_Indexer.egg-info → onestop4all_indexer-2.8.0.dev8}/PKG-INFO +2 -5
- {onestop4all_indexer-2.8.0.dev6 → onestop4all_indexer-2.8.0.dev8}/harvesters/harvester_dataservice.py +1 -17
- {onestop4all_indexer-2.8.0.dev6 → onestop4all_indexer-2.8.0.dev8}/harvesters/harvester_dataset.py +1 -3
- {onestop4all_indexer-2.8.0.dev6 → onestop4all_indexer-2.8.0.dev8}/harvesters/harvester_organization.py +8 -4
- {onestop4all_indexer-2.8.0.dev6 → onestop4all_indexer-2.8.0.dev8}/setup.py +2 -5
- {onestop4all_indexer-2.8.0.dev6 → onestop4all_indexer-2.8.0.dev8}/utils/configs.py +0 -16
- {onestop4all_indexer-2.8.0.dev6 → onestop4all_indexer-2.8.0.dev8}/utils/harvest.py +0 -27
- {onestop4all_indexer-2.8.0.dev6 → onestop4all_indexer-2.8.0.dev8}/utils/solr.py +6 -3
- onestop4all_indexer-2.8.0.dev6/utils/embeddings.py +0 -67
- onestop4all_indexer-2.8.0.dev6/utils/qdrant.py +0 -97
- {onestop4all_indexer-2.8.0.dev6 → onestop4all_indexer-2.8.0.dev8}/LICENSE +0 -0
- {onestop4all_indexer-2.8.0.dev6 → onestop4all_indexer-2.8.0.dev8}/OneStop4All_Indexer.egg-info/dependency_links.txt +0 -0
- {onestop4all_indexer-2.8.0.dev6 → onestop4all_indexer-2.8.0.dev8}/OneStop4All_Indexer.egg-info/entry_points.txt +0 -0
- {onestop4all_indexer-2.8.0.dev6 → onestop4all_indexer-2.8.0.dev8}/OneStop4All_Indexer.egg-info/top_level.txt +0 -0
- {onestop4all_indexer-2.8.0.dev6 → onestop4all_indexer-2.8.0.dev8}/data_repositories/__init__.py +0 -0
- {onestop4all_indexer-2.8.0.dev6 → onestop4all_indexer-2.8.0.dev8}/data_repositories/repository_base.py +0 -0
- {onestop4all_indexer-2.8.0.dev6 → onestop4all_indexer-2.8.0.dev8}/data_repositories/repository_n4eorganization.py +0 -0
- {onestop4all_indexer-2.8.0.dev6 → onestop4all_indexer-2.8.0.dev8}/data_repositories/repository_person.py +0 -0
- {onestop4all_indexer-2.8.0.dev6 → onestop4all_indexer-2.8.0.dev8}/data_repositories/repository_resource_links.py +0 -0
- {onestop4all_indexer-2.8.0.dev6 → onestop4all_indexer-2.8.0.dev8}/data_repositories/repository_theme.py +0 -0
- {onestop4all_indexer-2.8.0.dev6 → onestop4all_indexer-2.8.0.dev8}/harvesters/__init__.py +0 -0
- {onestop4all_indexer-2.8.0.dev6 → onestop4all_indexer-2.8.0.dev8}/harvesters/harvester_article.py +0 -0
- {onestop4all_indexer-2.8.0.dev6 → onestop4all_indexer-2.8.0.dev8}/harvesters/harvester_base.py +0 -0
- {onestop4all_indexer-2.8.0.dev6 → onestop4all_indexer-2.8.0.dev8}/harvesters/harvester_document.py +0 -0
- {onestop4all_indexer-2.8.0.dev6 → onestop4all_indexer-2.8.0.dev8}/harvesters/harvester_learningresource.py +0 -0
- {onestop4all_indexer-2.8.0.dev6 → onestop4all_indexer-2.8.0.dev8}/harvesters/harvester_metadatastandards.py +0 -0
- {onestop4all_indexer-2.8.0.dev6 → onestop4all_indexer-2.8.0.dev8}/harvesters/harvester_repository.py +0 -0
- {onestop4all_indexer-2.8.0.dev6 → onestop4all_indexer-2.8.0.dev8}/harvesters/harvester_service.py +0 -0
- {onestop4all_indexer-2.8.0.dev6 → onestop4all_indexer-2.8.0.dev8}/harvesters/harvester_softwaresourcecode.py +0 -0
- {onestop4all_indexer-2.8.0.dev6 → onestop4all_indexer-2.8.0.dev8}/pyproject.toml +0 -0
- {onestop4all_indexer-2.8.0.dev6 → onestop4all_indexer-2.8.0.dev8}/setup.cfg +0 -0
- {onestop4all_indexer-2.8.0.dev6 → onestop4all_indexer-2.8.0.dev8}/utils/__init__.py +0 -0
- {onestop4all_indexer-2.8.0.dev6 → onestop4all_indexer-2.8.0.dev8}/utils/cli.py +0 -0
- {onestop4all_indexer-2.8.0.dev6 → onestop4all_indexer-2.8.0.dev8}/utils/sparql.py +0 -0
- {onestop4all_indexer-2.8.0.dev6 → onestop4all_indexer-2.8.0.dev8}/utils/util.py +0 -0
|
@@ -1,19 +1,16 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: OneStop4All-Indexer
|
|
3
|
-
Version: 2.8.0.dev6
|
|
3
|
+
Version: 2.8.0.dev8
|
|
4
4
|
Summary: Library to harvest data from NFDI4Earth-KnowledgeHub to OneStop4All-Index
|
|
5
5
|
Author: Markus Konkol, Arne Vogt, Tom Niers, Ralf Klammer
|
|
6
6
|
Author-email: m.konkol@52north.org, a.vogt@52north.org, tom.niers@tu-dresden.de, ralf.klammer@tu-dresden.de
|
|
7
7
|
License-File: LICENSE
|
|
8
8
|
Requires-Dist: click
|
|
9
9
|
Requires-Dist: sparqlwrapper~=2.0.0
|
|
10
|
-
Requires-Dist: pysolr
|
|
10
|
+
Requires-Dist: pysolr~=3.11.0
|
|
11
11
|
Requires-Dist: jproperties~=2.1.1
|
|
12
12
|
Requires-Dist: geomet~=1.1.0
|
|
13
13
|
Requires-Dist: shapely~=2.0.5
|
|
14
|
-
Requires-Dist: sentence-transformers~=5.1.0
|
|
15
|
-
Requires-Dist: qdrant-client~=1.15.1
|
|
16
|
-
Requires-Dist: xformers
|
|
17
14
|
Dynamic: author
|
|
18
15
|
Dynamic: author-email
|
|
19
16
|
Dynamic: license-file
|
|
@@ -1,19 +1,16 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: OneStop4All-Indexer
|
|
3
|
-
Version: 2.8.0.dev6
|
|
3
|
+
Version: 2.8.0.dev8
|
|
4
4
|
Summary: Library to harvest data from NFDI4Earth-KnowledgeHub to OneStop4All-Index
|
|
5
5
|
Author: Markus Konkol, Arne Vogt, Tom Niers, Ralf Klammer
|
|
6
6
|
Author-email: m.konkol@52north.org, a.vogt@52north.org, tom.niers@tu-dresden.de, ralf.klammer@tu-dresden.de
|
|
7
7
|
License-File: LICENSE
|
|
8
8
|
Requires-Dist: click
|
|
9
9
|
Requires-Dist: sparqlwrapper~=2.0.0
|
|
10
|
-
Requires-Dist: pysolr
|
|
10
|
+
Requires-Dist: pysolr~=3.11.0
|
|
11
11
|
Requires-Dist: jproperties~=2.1.1
|
|
12
12
|
Requires-Dist: geomet~=1.1.0
|
|
13
13
|
Requires-Dist: shapely~=2.0.5
|
|
14
|
-
Requires-Dist: sentence-transformers~=5.1.0
|
|
15
|
-
Requires-Dist: qdrant-client~=1.15.1
|
|
16
|
-
Requires-Dist: xformers
|
|
17
14
|
Dynamic: author
|
|
18
15
|
Dynamic: author-email
|
|
19
16
|
Dynamic: license-file
|
|
@@ -1,6 +1,4 @@
|
|
|
1
1
|
import logging
|
|
2
|
-
import json
|
|
3
|
-
import os
|
|
4
2
|
from .harvester_base import Harvester
|
|
5
3
|
from data_repositories.repository_theme import RepositoryTheme
|
|
6
4
|
from data_repositories import RepositoryPerson
|
|
@@ -78,8 +76,6 @@ class DataService_Harvester(Harvester):
|
|
|
78
76
|
self.solr_validator = SolrValidator()
|
|
79
77
|
self.solr_validation = solr_validation
|
|
80
78
|
self.removed_geometries = []
|
|
81
|
-
self.load_from_file = os.getenv("DS_LOAD_FROM_PATH", None)
|
|
82
|
-
self.save_to_file = os.getenv("DS_SAVE_TO_PATH", None)
|
|
83
79
|
self.iteration_start = (
|
|
84
80
|
int(iteration_start) if iteration_start is not None else 0
|
|
85
81
|
)
|
|
@@ -96,13 +92,6 @@ iter_end={self.iteration_end}"""
|
|
|
96
92
|
)
|
|
97
93
|
|
|
98
94
|
def harvest(self):
|
|
99
|
-
|
|
100
|
-
if self.load_from_file is not None:
|
|
101
|
-
log.info(f"Loading DataServices from file {self.load_from_file}")
|
|
102
|
-
with open(self.load_from_file, "r", encoding="utf-8") as f:
|
|
103
|
-
services_list = json.load(f)
|
|
104
|
-
return services_list
|
|
105
|
-
|
|
106
95
|
limit = self.page_size
|
|
107
96
|
# convert to list of repo documents for indexing
|
|
108
97
|
services = {} # repos dict
|
|
@@ -111,7 +100,7 @@ iter_end={self.iteration_end}"""
|
|
|
111
100
|
hits = {}
|
|
112
101
|
# split sparql query
|
|
113
102
|
while True:
|
|
114
|
-
if self.iteration_end is not None and i
|
|
103
|
+
if self.iteration_end is not None and i > self.iteration_end:
|
|
115
104
|
break
|
|
116
105
|
|
|
117
106
|
query_splitted = self.sparql_query % (limit * i, limit)
|
|
@@ -140,11 +129,6 @@ iter_end={self.iteration_end}"""
|
|
|
140
129
|
service["mainTitle"] = service["mainTitle"].strip()
|
|
141
130
|
services_list.append(service)
|
|
142
131
|
|
|
143
|
-
if self.save_to_file is not None:
|
|
144
|
-
log.info(f"Saving DataServices to file {self.save_to_file}")
|
|
145
|
-
with open(self.save_to_file, "w", encoding="utf-8") as f:
|
|
146
|
-
json.dump(services_list, f, ensure_ascii=False, indent=2)
|
|
147
|
-
|
|
148
132
|
return services_list
|
|
149
133
|
|
|
150
134
|
def parse_response(self, hits, services):
|
{onestop4all_indexer-2.8.0.dev6 → onestop4all_indexer-2.8.0.dev8}/harvesters/harvester_dataset.py
RENAMED
|
@@ -24,9 +24,6 @@ class Dataset_Harvester(HarvesterCordra):
|
|
|
24
24
|
iteration_end=None,
|
|
25
25
|
page_size=50000,
|
|
26
26
|
query='type:"Dataset"',
|
|
27
|
-
# query='id:"n4e/dthb-oai-pangaea.de-doi-10.1594-PANGAEA.981078"', #downloadURL: https://cordra.knowledgehub.test.n4e.geo.tu-dresden.de/objects/n4e/dthb-oai-pangaea.de-doi-10.1594-PANGAEA.981078
|
|
28
|
-
# query='id:"n4e/dthb-GB_NERC_BAS_PDC_01994"', #accessURL: https://cordra.knowledgehub.test.n4e.geo.tu-dresden.de/objects/n4e/dthb-GB_NERC_BAS_PDC_01994
|
|
29
|
-
# query='id:"n4e/dthb-6A0D8B9D-1BBD-441B-BA5C-6159EE41EE71"', #multiple accessURLs: https://cordra.knowledgehub.nfdi4earth.de/objects/n4e/dthb-6A0D8B9D-1BBD-441B-BA5C-6159EE41EE71,
|
|
30
27
|
solr_validation=True,
|
|
31
28
|
**kw,
|
|
32
29
|
):
|
|
@@ -302,6 +299,7 @@ class Dataset_Harvester(HarvesterCordra):
|
|
|
302
299
|
value=val,
|
|
303
300
|
)
|
|
304
301
|
if "downloadURL" in distribution:
|
|
302
|
+
print(distribution["downloadURL"])
|
|
305
303
|
for download_url in distribution["downloadURL"]:
|
|
306
304
|
val = self.get_string_from_jsonld(
|
|
307
305
|
download_url, subject
|
|
@@ -149,11 +149,15 @@ class Organization_Harvester(Harvester):
|
|
|
149
149
|
|
|
150
150
|
# set geometry if available and not already set
|
|
151
151
|
if (
|
|
152
|
-
"
|
|
153
|
-
and
|
|
154
|
-
|
|
152
|
+
predicate == "http://www.opengis.net/ont/geosparql#hasGeometry"
|
|
153
|
+
and ("geometry" not in organizations[subject]
|
|
154
|
+
or hit["geo_as_wkt"]["value"] not in organizations[subject]["geometry"])
|
|
155
155
|
):
|
|
156
|
-
|
|
156
|
+
self.addValue(
|
|
157
|
+
dict=organizations[subject],
|
|
158
|
+
attribute="geometry",
|
|
159
|
+
value=hit["geo_as_wkt"]["value"]
|
|
160
|
+
)
|
|
157
161
|
|
|
158
162
|
# set membership in N4E project
|
|
159
163
|
is_n4e_member = hit.get("isN4EMember", {}).get("value", None)
|
|
@@ -2,7 +2,7 @@ from setuptools import find_packages, setup
|
|
|
2
2
|
|
|
3
3
|
setup(
|
|
4
4
|
name="OneStop4All-Indexer",
|
|
5
|
-
version="2.8.0.dev6",
|
|
5
|
+
version="2.8.0.dev8",
|
|
6
6
|
description="Library to harvest data from NFDI4Earth-KnowledgeHub to OneStop4All-Index",
|
|
7
7
|
author="Markus Konkol, Arne Vogt, Tom Niers, Ralf Klammer",
|
|
8
8
|
author_email="m.konkol@52north.org, a.vogt@52north.org, tom.niers@tu-dresden.de, ralf.klammer@tu-dresden.de",
|
|
@@ -10,13 +10,10 @@ setup(
|
|
|
10
10
|
install_requires=[
|
|
11
11
|
"click",
|
|
12
12
|
"sparqlwrapper~= 2.0.0",
|
|
13
|
-
"pysolr",
|
|
13
|
+
"pysolr~= 3.11.0",
|
|
14
14
|
"jproperties~= 2.1.1 ",
|
|
15
15
|
"geomet ~= 1.1.0",
|
|
16
16
|
"shapely ~= 2.0.5",
|
|
17
|
-
"sentence-transformers ~= 5.1.0",
|
|
18
|
-
"qdrant-client ~= 1.15.1",
|
|
19
|
-
"xformers",
|
|
20
17
|
],
|
|
21
18
|
include_package_data=True,
|
|
22
19
|
entry_points={
|
|
@@ -88,20 +88,4 @@ config = {
|
|
|
88
88
|
),
|
|
89
89
|
)
|
|
90
90
|
),
|
|
91
|
-
"qdrant_url": os.getenv(
|
|
92
|
-
"QDRANT_URL",
|
|
93
|
-
default=(
|
|
94
|
-
app_properties.get("qdrant_url").data
|
|
95
|
-
if app_properties.get("qdrant_url")
|
|
96
|
-
else None
|
|
97
|
-
),
|
|
98
|
-
),
|
|
99
|
-
"qdrant_api_key": os.getenv(
|
|
100
|
-
"QDRANT_API_KEY",
|
|
101
|
-
default=(
|
|
102
|
-
app_properties.get("qdrant_api_key").data
|
|
103
|
-
if app_properties.get("qdrant_api_key")
|
|
104
|
-
else None
|
|
105
|
-
),
|
|
106
|
-
),
|
|
107
91
|
}
|
|
@@ -25,8 +25,6 @@ from harvesters import (
|
|
|
25
25
|
Service_Harvester,
|
|
26
26
|
)
|
|
27
27
|
from utils import config, Solr
|
|
28
|
-
from utils import embeddings
|
|
29
|
-
from utils.qdrant import Qdrant
|
|
30
28
|
|
|
31
29
|
log = logging.getLogger(__name__)
|
|
32
30
|
|
|
@@ -121,7 +119,6 @@ def run(
|
|
|
121
119
|
**further_options,
|
|
122
120
|
):
|
|
123
121
|
solr = Solr()
|
|
124
|
-
qdrant = Qdrant() # vector db
|
|
125
122
|
|
|
126
123
|
if reset_index:
|
|
127
124
|
solr.reset_index()
|
|
@@ -141,29 +138,12 @@ def run(
|
|
|
141
138
|
documents = harvester.harvest()
|
|
142
139
|
elapsed_time = timeit.default_timer() - start_time
|
|
143
140
|
|
|
144
|
-
try:
|
|
145
|
-
generate_embeddings(documents)
|
|
146
|
-
except Exception as e:
|
|
147
|
-
log.error(e)
|
|
148
|
-
|
|
149
|
-
log.info(f"index harvested {len(documents)} documents (vector db)")
|
|
150
|
-
qdrant.index_documents(documents)
|
|
151
|
-
log.info(
|
|
152
|
-
f"finished indexing for harvester {i} of {len(harvesters)}) (vector db)"
|
|
153
|
-
)
|
|
154
|
-
|
|
155
|
-
# remove embeddings from document, otherwise embeddings would be indexed in solr
|
|
156
|
-
# comment loop if embeddings should be indexed in solr as well.
|
|
157
|
-
for document in documents:
|
|
158
|
-
document.pop("embedding_384", None)
|
|
159
|
-
|
|
160
141
|
log.info(f"index harvested {len(documents)} documents")
|
|
161
142
|
solr.index_documents(documents)
|
|
162
143
|
document_count += len(documents)
|
|
163
144
|
log.info(
|
|
164
145
|
f"finished indexing for harvester {i} of {len(harvesters)})"
|
|
165
146
|
)
|
|
166
|
-
|
|
167
147
|
stats[harvester.get_type()] = {
|
|
168
148
|
"document_count": len(documents),
|
|
169
149
|
"processing_time": round(elapsed_time, 4),
|
|
@@ -190,12 +170,5 @@ def run(
|
|
|
190
170
|
)
|
|
191
171
|
|
|
192
172
|
|
|
193
|
-
def generate_embeddings(documents):
|
|
194
|
-
log.info("create embeddings for " + str(len(documents)) + " documents")
|
|
195
|
-
for document in documents:
|
|
196
|
-
embeddings.add_embedding(document)
|
|
197
|
-
log.info("created embeddings for batch")
|
|
198
|
-
|
|
199
|
-
|
|
200
173
|
if __name__ == "__main__":
|
|
201
174
|
run()
|
|
@@ -3,7 +3,6 @@ import logging
|
|
|
3
3
|
from pysolr import Solr as SolrBase
|
|
4
4
|
from requests.auth import HTTPBasicAuth
|
|
5
5
|
from typing import List, Dict, Optional, Literal
|
|
6
|
-
from urllib.parse import urljoin
|
|
7
6
|
|
|
8
7
|
from utils import config
|
|
9
8
|
|
|
@@ -33,9 +32,10 @@ class Solr(object):
|
|
|
33
32
|
def endpoint(self):
|
|
34
33
|
# using config-values (by default) OR
|
|
35
34
|
# overwrite with initially given values
|
|
35
|
+
# TODO: check if endpoint is reachable, if not raise error
|
|
36
36
|
solr_url = self.solr_url if self.solr_url else config["solr_url"]
|
|
37
37
|
solr_core = self.solr_core if self.solr_core else config["solr_core"]
|
|
38
|
-
return
|
|
38
|
+
return f"{solr_url.rstrip('/')}/{solr_core}"
|
|
39
39
|
|
|
40
40
|
@property
|
|
41
41
|
def authentication(self):
|
|
@@ -112,4 +112,7 @@ class SolrValidator(Solr):
|
|
|
112
112
|
return False
|
|
113
113
|
|
|
114
114
|
def close(self):
|
|
115
|
-
|
|
115
|
+
# technically not necessary to check existence of test document,
|
|
116
|
+
# but when urls are misconfigured, this will throw an error
|
|
117
|
+
if self.solr.search("id:geomValidationTest").hits:
|
|
118
|
+
self.solr.delete(q="id:geomValidationTest")
|
|
@@ -1,67 +0,0 @@
|
|
|
1
|
-
from sentence_transformers import SentenceTransformer
|
|
2
|
-
import logging
|
|
3
|
-
|
|
4
|
-
log = logging.getLogger(__name__)
|
|
5
|
-
|
|
6
|
-
model_384 = SentenceTransformer(
|
|
7
|
-
"sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"
|
|
8
|
-
)
|
|
9
|
-
# model_768 = SentenceTransformer("sentence-transformers/paraphrase-multilingual-mpnet-base-v2")
|
|
10
|
-
# model_1024 = SentenceTransformer("Qwen/Qwen3-Embedding-0.6B")
|
|
11
|
-
|
|
12
|
-
|
|
13
|
-
def add_embedding(document):
|
|
14
|
-
if document["type"] == "person_nested" or "person_nested" in ",".join(
|
|
15
|
-
document
|
|
16
|
-
): # no embeddings for nested author documents
|
|
17
|
-
return
|
|
18
|
-
|
|
19
|
-
try:
|
|
20
|
-
emb_str = get_entity_card(document)
|
|
21
|
-
except Exception as e:
|
|
22
|
-
log.error(e)
|
|
23
|
-
log.error(document)
|
|
24
|
-
log.info(emb_str if emb_str.isprintable() else document["id"])
|
|
25
|
-
try:
|
|
26
|
-
if emb_str is not None:
|
|
27
|
-
document["embedding_384"] = model_384.encode(emb_str).tolist()
|
|
28
|
-
# document["embedding_768"] = model_768.encode(emb_str).tolist()
|
|
29
|
-
# document["embedding_1024"] = model_1024.encode(emb_str).tolist()
|
|
30
|
-
except Exception as e:
|
|
31
|
-
log.error(e)
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
def get_entity_card(document):
|
|
35
|
-
text = document["mainTitle"]
|
|
36
|
-
if "description" in document and len(document["description"]) > 0:
|
|
37
|
-
text += "\n" + "\n".join(document["description"])
|
|
38
|
-
if "keyword" in document and len(document["keyword"]) > 0:
|
|
39
|
-
text += "\n" + ",".join(document["keyword"])
|
|
40
|
-
elif "keywords" in document and len(document["keywords"]) > 0:
|
|
41
|
-
text += "\n" + ",".join(document["keywords"])
|
|
42
|
-
|
|
43
|
-
if (
|
|
44
|
-
"http://xmlns.com/foaf/0.1/Organization" in document["type"]
|
|
45
|
-
): # special attributes for organizations
|
|
46
|
-
if "altLabel" in document:
|
|
47
|
-
text += "\n" + ",".join(document["altLabel"])
|
|
48
|
-
if "countryName" in document:
|
|
49
|
-
text += "\n" + "country: " + ",".join(document["countryName"])
|
|
50
|
-
if "locality" in document:
|
|
51
|
-
text += "\n" + "location: " + ",".join(document["locality"])
|
|
52
|
-
|
|
53
|
-
return text
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
def get_type_str(
|
|
57
|
-
document,
|
|
58
|
-
): # might need improvement with proper mapping (aligned with OS4A)
|
|
59
|
-
type_uri = document["type"]
|
|
60
|
-
if isinstance(type_uri, str):
|
|
61
|
-
type_val = type_uri.strip().lstrip("/").split("/")[-1]
|
|
62
|
-
elif isinstance(type_uri, list):
|
|
63
|
-
uris = []
|
|
64
|
-
for uri in type_uri:
|
|
65
|
-
uris.append(uri.strip().lstrip("/").split("/")[-1])
|
|
66
|
-
type_val = ",".join(uris)
|
|
67
|
-
return type_val
|
|
@@ -1,97 +0,0 @@
|
|
|
1
|
-
import datetime
|
|
2
|
-
import logging
|
|
3
|
-
import uuid
|
|
4
|
-
|
|
5
|
-
from qdrant_client import QdrantClient, models
|
|
6
|
-
from typing import List, Dict
|
|
7
|
-
|
|
8
|
-
from utils import config
|
|
9
|
-
|
|
10
|
-
log = logging.getLogger(__name__)
|
|
11
|
-
|
|
12
|
-
|
|
13
|
-
class Qdrant:
|
|
14
|
-
|
|
15
|
-
def __init__(self):
|
|
16
|
-
self.endpoint = config["qdrant_url"]
|
|
17
|
-
self.client = QdrantClient(self.endpoint, api_key=config["qdrant_url"])
|
|
18
|
-
log.debug(f"Initialized Qdrant client with endpoint {self.endpoint}")
|
|
19
|
-
self.collections = [
|
|
20
|
-
{"dim": 384, "name": "embedding_384"}
|
|
21
|
-
# {"dim": 768, "name": "embedding_768"},
|
|
22
|
-
# {"dim": 1024, "name": "embedding_1024"}
|
|
23
|
-
]
|
|
24
|
-
|
|
25
|
-
self.init_collections()
|
|
26
|
-
|
|
27
|
-
def init_collections(self):
|
|
28
|
-
for collection in self.collections:
|
|
29
|
-
self.init_collection(collection["dim"], collection["name"])
|
|
30
|
-
|
|
31
|
-
def init_collection(self, vector_dim, collection_name):
|
|
32
|
-
collections = self.client.get_collections() # existing collections
|
|
33
|
-
collection_names = [col.name for col in collections.collections]
|
|
34
|
-
|
|
35
|
-
try:
|
|
36
|
-
if (
|
|
37
|
-
collection_name not in collection_names
|
|
38
|
-
): # only create if not existing
|
|
39
|
-
self.client.create_collection(
|
|
40
|
-
collection_name=collection_name,
|
|
41
|
-
vectors_config=models.VectorParams(
|
|
42
|
-
size=vector_dim, distance=models.Distance.COSINE
|
|
43
|
-
),
|
|
44
|
-
)
|
|
45
|
-
self.client.create_payload_index(
|
|
46
|
-
collection_name=collection_name,
|
|
47
|
-
field_name="id",
|
|
48
|
-
field_schema="keyword",
|
|
49
|
-
)
|
|
50
|
-
except Exception as e:
|
|
51
|
-
log.error(e)
|
|
52
|
-
|
|
53
|
-
def index_documents(self, documents):
|
|
54
|
-
for collection in self.collections:
|
|
55
|
-
self.index_documents_dim(
|
|
56
|
-
documents,
|
|
57
|
-
collection_name=collection["name"],
|
|
58
|
-
embedding_key=collection["name"],
|
|
59
|
-
)
|
|
60
|
-
|
|
61
|
-
def index_documents_dim(
|
|
62
|
-
self, documents: List[Dict], collection_name, embedding_key
|
|
63
|
-
) -> None:
|
|
64
|
-
log.info(
|
|
65
|
-
f"start indexing {len(documents)} documents to {self.endpoint}"
|
|
66
|
-
)
|
|
67
|
-
|
|
68
|
-
embeddings = []
|
|
69
|
-
payloads = []
|
|
70
|
-
ids = []
|
|
71
|
-
|
|
72
|
-
try:
|
|
73
|
-
for document in documents:
|
|
74
|
-
ids.append(
|
|
75
|
-
str(uuid.uuid5(uuid.NAMESPACE_DNS, document["id"]))
|
|
76
|
-
) # generates always the same uuid for the same document id
|
|
77
|
-
embeddings.append(document[str(embedding_key)])
|
|
78
|
-
payload = {
|
|
79
|
-
"id": document["id"],
|
|
80
|
-
"mainTitle": document["mainTitle"],
|
|
81
|
-
"type": document["type"],
|
|
82
|
-
"indexedAt": datetime.datetime.now(datetime.timezone.utc),
|
|
83
|
-
}
|
|
84
|
-
if "description" in document:
|
|
85
|
-
payload["description"] = document["description"]
|
|
86
|
-
if "keywords" in document:
|
|
87
|
-
payload["keyword"] = document["keyword"]
|
|
88
|
-
payloads.append(payload)
|
|
89
|
-
|
|
90
|
-
self.client.upload_collection(
|
|
91
|
-
collection_name=collection_name,
|
|
92
|
-
vectors=embeddings,
|
|
93
|
-
ids=ids,
|
|
94
|
-
payload=payloads,
|
|
95
|
-
)
|
|
96
|
-
except Exception as e:
|
|
97
|
-
log.error(e)
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{onestop4all_indexer-2.8.0.dev6 → onestop4all_indexer-2.8.0.dev8}/data_repositories/__init__.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{onestop4all_indexer-2.8.0.dev6 → onestop4all_indexer-2.8.0.dev8}/harvesters/harvester_article.py
RENAMED
|
File without changes
|
{onestop4all_indexer-2.8.0.dev6 → onestop4all_indexer-2.8.0.dev8}/harvesters/harvester_base.py
RENAMED
|
File without changes
|
{onestop4all_indexer-2.8.0.dev6 → onestop4all_indexer-2.8.0.dev8}/harvesters/harvester_document.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
{onestop4all_indexer-2.8.0.dev6 → onestop4all_indexer-2.8.0.dev8}/harvesters/harvester_repository.py
RENAMED
|
File without changes
|
{onestop4all_indexer-2.8.0.dev6 → onestop4all_indexer-2.8.0.dev8}/harvesters/harvester_service.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|