OneStop4All-Indexer 2.8.0.dev3__tar.gz → 2.8.0.dev5__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (39)
  1. {onestop4all_indexer-2.8.0.dev3 → onestop4all_indexer-2.8.0.dev5/OneStop4All_Indexer.egg-info}/PKG-INFO +5 -2
  2. {onestop4all_indexer-2.8.0.dev3 → onestop4all_indexer-2.8.0.dev5}/OneStop4All_Indexer.egg-info/SOURCES.txt +2 -0
  3. {onestop4all_indexer-2.8.0.dev3 → onestop4all_indexer-2.8.0.dev5}/OneStop4All_Indexer.egg-info/requires.txt +4 -1
  4. {onestop4all_indexer-2.8.0.dev3/OneStop4All_Indexer.egg-info → onestop4all_indexer-2.8.0.dev5}/PKG-INFO +5 -2
  5. {onestop4all_indexer-2.8.0.dev3 → onestop4all_indexer-2.8.0.dev5}/harvesters/harvester_dataservice.py +19 -0
  6. {onestop4all_indexer-2.8.0.dev3 → onestop4all_indexer-2.8.0.dev5}/harvesters/harvester_dataset.py +0 -1
  7. {onestop4all_indexer-2.8.0.dev3 → onestop4all_indexer-2.8.0.dev5}/setup.py +5 -2
  8. {onestop4all_indexer-2.8.0.dev3 → onestop4all_indexer-2.8.0.dev5}/utils/configs.py +16 -0
  9. onestop4all_indexer-2.8.0.dev5/utils/embeddings.py +67 -0
  10. onestop4all_indexer-2.8.0.dev5/utils/harvest.py +201 -0
  11. onestop4all_indexer-2.8.0.dev5/utils/qdrant.py +97 -0
  12. onestop4all_indexer-2.8.0.dev3/utils/harvest.py +0 -123
  13. {onestop4all_indexer-2.8.0.dev3 → onestop4all_indexer-2.8.0.dev5}/LICENSE +0 -0
  14. {onestop4all_indexer-2.8.0.dev3 → onestop4all_indexer-2.8.0.dev5}/OneStop4All_Indexer.egg-info/dependency_links.txt +0 -0
  15. {onestop4all_indexer-2.8.0.dev3 → onestop4all_indexer-2.8.0.dev5}/OneStop4All_Indexer.egg-info/entry_points.txt +0 -0
  16. {onestop4all_indexer-2.8.0.dev3 → onestop4all_indexer-2.8.0.dev5}/OneStop4All_Indexer.egg-info/top_level.txt +0 -0
  17. {onestop4all_indexer-2.8.0.dev3 → onestop4all_indexer-2.8.0.dev5}/data_repositories/__init__.py +0 -0
  18. {onestop4all_indexer-2.8.0.dev3 → onestop4all_indexer-2.8.0.dev5}/data_repositories/repository_base.py +0 -0
  19. {onestop4all_indexer-2.8.0.dev3 → onestop4all_indexer-2.8.0.dev5}/data_repositories/repository_n4eorganization.py +0 -0
  20. {onestop4all_indexer-2.8.0.dev3 → onestop4all_indexer-2.8.0.dev5}/data_repositories/repository_person.py +0 -0
  21. {onestop4all_indexer-2.8.0.dev3 → onestop4all_indexer-2.8.0.dev5}/data_repositories/repository_resource_links.py +0 -0
  22. {onestop4all_indexer-2.8.0.dev3 → onestop4all_indexer-2.8.0.dev5}/data_repositories/repository_theme.py +0 -0
  23. {onestop4all_indexer-2.8.0.dev3 → onestop4all_indexer-2.8.0.dev5}/harvesters/__init__.py +0 -0
  24. {onestop4all_indexer-2.8.0.dev3 → onestop4all_indexer-2.8.0.dev5}/harvesters/harvester_article.py +0 -0
  25. {onestop4all_indexer-2.8.0.dev3 → onestop4all_indexer-2.8.0.dev5}/harvesters/harvester_base.py +0 -0
  26. {onestop4all_indexer-2.8.0.dev3 → onestop4all_indexer-2.8.0.dev5}/harvesters/harvester_document.py +0 -0
  27. {onestop4all_indexer-2.8.0.dev3 → onestop4all_indexer-2.8.0.dev5}/harvesters/harvester_learningresource.py +0 -0
  28. {onestop4all_indexer-2.8.0.dev3 → onestop4all_indexer-2.8.0.dev5}/harvesters/harvester_metadatastandards.py +0 -0
  29. {onestop4all_indexer-2.8.0.dev3 → onestop4all_indexer-2.8.0.dev5}/harvesters/harvester_organization.py +0 -0
  30. {onestop4all_indexer-2.8.0.dev3 → onestop4all_indexer-2.8.0.dev5}/harvesters/harvester_repository.py +0 -0
  31. {onestop4all_indexer-2.8.0.dev3 → onestop4all_indexer-2.8.0.dev5}/harvesters/harvester_service.py +0 -0
  32. {onestop4all_indexer-2.8.0.dev3 → onestop4all_indexer-2.8.0.dev5}/harvesters/harvester_softwaresourcecode.py +0 -0
  33. {onestop4all_indexer-2.8.0.dev3 → onestop4all_indexer-2.8.0.dev5}/pyproject.toml +0 -0
  34. {onestop4all_indexer-2.8.0.dev3 → onestop4all_indexer-2.8.0.dev5}/setup.cfg +0 -0
  35. {onestop4all_indexer-2.8.0.dev3 → onestop4all_indexer-2.8.0.dev5}/utils/__init__.py +0 -0
  36. {onestop4all_indexer-2.8.0.dev3 → onestop4all_indexer-2.8.0.dev5}/utils/cli.py +0 -0
  37. {onestop4all_indexer-2.8.0.dev3 → onestop4all_indexer-2.8.0.dev5}/utils/solr.py +0 -0
  38. {onestop4all_indexer-2.8.0.dev3 → onestop4all_indexer-2.8.0.dev5}/utils/sparql.py +0 -0
  39. {onestop4all_indexer-2.8.0.dev3 → onestop4all_indexer-2.8.0.dev5}/utils/util.py +0 -0
@@ -1,16 +1,19 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: OneStop4All-Indexer
3
- Version: 2.8.0.dev3
3
+ Version: 2.8.0.dev5
4
4
  Summary: Library to harvest data from NFDI4Earth-KnowledgeHub to OneStop4All-Index
5
5
  Author: Markus Konkol, Arne Vogt, Tom Niers, Ralf Klammer
6
6
  Author-email: m.konkol@52north.org, a.vogt@52north.org, tom.niers@tu-dresden.de, ralf.klammer@tu-dresden.de
7
7
  License-File: LICENSE
8
8
  Requires-Dist: click
9
9
  Requires-Dist: sparqlwrapper~=2.0.0
10
- Requires-Dist: pysolr~=3.9.0
10
+ Requires-Dist: pysolr>=3.11.0
11
11
  Requires-Dist: jproperties~=2.1.1
12
12
  Requires-Dist: geomet~=1.1.0
13
13
  Requires-Dist: shapely~=2.0.5
14
+ Requires-Dist: sentence-transformers~=5.1.0
15
+ Requires-Dist: qdrant-client~=1.15.1
16
+ Requires-Dist: xformers
14
17
  Dynamic: author
15
18
  Dynamic: author-email
16
19
  Dynamic: license-file
@@ -28,7 +28,9 @@ harvesters/harvester_softwaresourcecode.py
28
28
  utils/__init__.py
29
29
  utils/cli.py
30
30
  utils/configs.py
31
+ utils/embeddings.py
31
32
  utils/harvest.py
33
+ utils/qdrant.py
32
34
  utils/solr.py
33
35
  utils/sparql.py
34
36
  utils/util.py
@@ -1,6 +1,9 @@
1
1
  click
2
2
  sparqlwrapper~=2.0.0
3
- pysolr~=3.9.0
3
+ pysolr>=3.11.0
4
4
  jproperties~=2.1.1
5
5
  geomet~=1.1.0
6
6
  shapely~=2.0.5
7
+ sentence-transformers~=5.1.0
8
+ qdrant-client~=1.15.1
9
+ xformers
@@ -1,16 +1,19 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: OneStop4All-Indexer
3
- Version: 2.8.0.dev3
3
+ Version: 2.8.0.dev5
4
4
  Summary: Library to harvest data from NFDI4Earth-KnowledgeHub to OneStop4All-Index
5
5
  Author: Markus Konkol, Arne Vogt, Tom Niers, Ralf Klammer
6
6
  Author-email: m.konkol@52north.org, a.vogt@52north.org, tom.niers@tu-dresden.de, ralf.klammer@tu-dresden.de
7
7
  License-File: LICENSE
8
8
  Requires-Dist: click
9
9
  Requires-Dist: sparqlwrapper~=2.0.0
10
- Requires-Dist: pysolr~=3.9.0
10
+ Requires-Dist: pysolr>=3.11.0
11
11
  Requires-Dist: jproperties~=2.1.1
12
12
  Requires-Dist: geomet~=1.1.0
13
13
  Requires-Dist: shapely~=2.0.5
14
+ Requires-Dist: sentence-transformers~=5.1.0
15
+ Requires-Dist: qdrant-client~=1.15.1
16
+ Requires-Dist: xformers
14
17
  Dynamic: author
15
18
  Dynamic: author-email
16
19
  Dynamic: license-file
@@ -1,4 +1,5 @@
1
1
  import logging
2
+ import json
2
3
  from .harvester_base import Harvester
3
4
  from data_repositories.repository_theme import RepositoryTheme
4
5
  from data_repositories import RepositoryPerson
@@ -68,6 +69,10 @@ class DataService_Harvester(Harvester):
68
69
  iteration_start=0,
69
70
  iteration_end=None,
70
71
  page_size=10000,
72
+ load_from_file="/tmp/dataservices.json",
73
+ # load_from_file=None,
74
+ # save_to_file="/tmp/dataservices.json",
75
+ save_to_file=None,
71
76
  **kw,
72
77
  ):
73
78
  super().__init__(**kw)
@@ -76,6 +81,8 @@ class DataService_Harvester(Harvester):
76
81
  self.solr_validator = SolrValidator()
77
82
  self.solr_validation = solr_validation
78
83
  self.removed_geometries = []
84
+ self.load_from_file = load_from_file
85
+ self.save_to_file = save_to_file
79
86
  self.iteration_start = (
80
87
  int(iteration_start) if iteration_start is not None else 0
81
88
  )
@@ -92,6 +99,13 @@ iter_end={self.iteration_end}"""
92
99
  )
93
100
 
94
101
  def harvest(self):
102
+
103
+ if self.load_from_file is not None:
104
+ log.info(f"Loading DataServices from file {self.load_from_file}")
105
+ with open(self.load_from_file, "r", encoding="utf-8") as f:
106
+ services_list = json.load(f)
107
+ return services_list
108
+
95
109
  limit = self.page_size
96
110
  # convert to list of repo documents for indexing
97
111
  services = {} # repos dict
@@ -129,6 +143,11 @@ iter_end={self.iteration_end}"""
129
143
  service["mainTitle"] = service["mainTitle"].strip()
130
144
  services_list.append(service)
131
145
 
146
+ if self.save_to_file is not None:
147
+ log.info(f"Saving DataServices to file {self.save_to_file}")
148
+ with open(self.save_to_file, "w", encoding="utf-8") as f:
149
+ json.dump(services_list, f, ensure_ascii=False, indent=2)
150
+
132
151
  return services_list
133
152
 
134
153
  def parse_response(self, hits, services):
@@ -302,7 +302,6 @@ class Dataset_Harvester(HarvesterCordra):
302
302
  value=val,
303
303
  )
304
304
  if "downloadURL" in distribution:
305
- print(distribution["downloadURL"])
306
305
  for download_url in distribution["downloadURL"]:
307
306
  val = self.get_string_from_jsonld(
308
307
  download_url, subject
@@ -2,7 +2,7 @@ from setuptools import find_packages, setup
2
2
 
3
3
  setup(
4
4
  name="OneStop4All-Indexer",
5
- version="2.8.0.dev3",
5
+ version="2.8.0.dev5",
6
6
  description="Library to harvest data from NFDI4Earth-KnowledgeHub to OneStop4All-Index",
7
7
  author="Markus Konkol, Arne Vogt, Tom Niers, Ralf Klammer",
8
8
  author_email="m.konkol@52north.org, a.vogt@52north.org, tom.niers@tu-dresden.de, ralf.klammer@tu-dresden.de",
@@ -10,10 +10,13 @@ setup(
10
10
  install_requires=[
11
11
  "click",
12
12
  "sparqlwrapper~= 2.0.0",
13
- "pysolr~= 3.9.0",
13
+ "pysolr>= 3.11.0",
14
14
  "jproperties~= 2.1.1 ",
15
15
  "geomet ~= 1.1.0",
16
16
  "shapely ~= 2.0.5",
17
+ "sentence-transformers ~= 5.1.0",
18
+ "qdrant-client ~= 1.15.1",
19
+ "xformers",
17
20
  ],
18
21
  include_package_data=True,
19
22
  entry_points={
@@ -88,4 +88,20 @@ config = {
88
88
  ),
89
89
  )
90
90
  ),
91
+ "qdrant_url": os.getenv(
92
+ "QDRANT_URL",
93
+ default=(
94
+ app_properties.get("qdrant_url").data
95
+ if app_properties.get("qdrant_url")
96
+ else None
97
+ ),
98
+ ),
99
+ "qdrant_api_key": os.getenv(
100
+ "QDRANT_API_KEY",
101
+ default=(
102
+ app_properties.get("qdrant_api_key").data
103
+ if app_properties.get("qdrant_api_key")
104
+ else None
105
+ ),
106
+ ),
91
107
  }
@@ -0,0 +1,67 @@
1
+ from sentence_transformers import SentenceTransformer
2
+ import logging
3
+
4
+ log = logging.getLogger(__name__)
5
+
6
+ model_384 = SentenceTransformer(
7
+ "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"
8
+ )
9
+ # model_768 = SentenceTransformer("sentence-transformers/paraphrase-multilingual-mpnet-base-v2")
10
+ # model_1024 = SentenceTransformer("Qwen/Qwen3-Embedding-0.6B")
11
+
12
+
13
def add_embedding(document):
    """Compute a sentence embedding for *document* and store it in-place
    under ``embedding_384``.

    Nested author documents are skipped. Errors are logged instead of
    raised so that a single bad document does not abort the harvesting run.

    Args:
        document: harvested document dict; must contain ``type`` and the
            fields read by :func:`get_entity_card`.
    """
    # no embeddings for nested author documents
    # NOTE(review): ",".join(document) joins the dict *keys*; presumably this
    # was meant to match a list-valued "type" field — confirm against callers.
    if document["type"] == "person_nested" or "person_nested" in ",".join(
        document
    ):
        return

    emb_str = None
    try:
        emb_str = get_entity_card(document)
    except Exception as e:
        log.error(e)
        log.error(document)
        # BUG FIX: previously fell through and hit a NameError on the
        # unbound emb_str below; without text there is nothing to embed.
        return

    log.info(emb_str if emb_str.isprintable() else document["id"])
    try:
        if emb_str is not None:
            document["embedding_384"] = model_384.encode(emb_str).tolist()
            # document["embedding_768"] = model_768.encode(emb_str).tolist()
            # document["embedding_1024"] = model_1024.encode(emb_str).tolist()
    except Exception as e:
        log.error(e)
32
+
33
+
34
def get_entity_card(document):
    """Build the text that is embedded for *document*.

    Concatenates title, description and keywords — plus alternative labels
    and location fields for organizations — into one newline-separated
    string.

    Args:
        document: dict with at least ``mainTitle``; all other fields are
            optional.

    Returns:
        str: newline-separated entity card text.
    """
    parts = [document["mainTitle"]]

    if document.get("description"):
        parts.append("\n".join(document["description"]))

    # harvesters are inconsistent: some emit "keyword", some "keywords"
    if document.get("keyword"):
        parts.append(",".join(document["keyword"]))
    elif document.get("keywords"):
        parts.append(",".join(document["keywords"]))

    # special attributes for organizations
    # BUG FIX: use .get so a document without "type" no longer raises KeyError
    if "http://xmlns.com/foaf/0.1/Organization" in document.get("type", ""):
        if "altLabel" in document:
            parts.append(",".join(document["altLabel"]))
        if "countryName" in document:
            parts.append("country: " + ",".join(document["countryName"]))
        if "locality" in document:
            parts.append("location: " + ",".join(document["locality"]))

    return "\n".join(parts)
54
+
55
+
56
def get_type_str(
    document,
):  # might need improvement with proper mapping (aligned with OS4A)
    """Return a short, comma-separated type label for *document*.

    Each type URI is reduced to its last path segment, e.g.
    ``http://xmlns.com/foaf/0.1/Organization`` -> ``Organization``.

    Args:
        document: dict with a ``type`` entry (str or list of str).

    Returns:
        str: short label, or ``str(type)`` for unexpected shapes.
    """

    def _short(uri):
        # last path segment of the URI
        return uri.strip().lstrip("/").split("/")[-1]

    type_uri = document["type"]
    if isinstance(type_uri, str):
        return _short(type_uri)
    if isinstance(type_uri, list):
        return ",".join(_short(uri) for uri in type_uri)
    # BUG FIX: previously fell through with type_val unbound -> NameError
    return str(type_uri)
@@ -0,0 +1,201 @@
1
+ import logging
2
+ import requests
3
+ import timeit
4
+ import traceback
5
+
6
+ from socket import gethostname
7
+ from typing import Optional
8
+
9
+ from data_repositories import (
10
+ RepositoryTheme,
11
+ RepositoryPerson,
12
+ RepositoryResourceLinks,
13
+ RepositoryN4EOrganization,
14
+ )
15
+ from harvesters import (
16
+ Repository_Harvester,
17
+ Organization_Harvester,
18
+ Article_Harvester,
19
+ Softwaresourcecode_Harvester,
20
+ Learningresource_Harvester,
21
+ Metadatastandard_Harvester,
22
+ Document_Harvester,
23
+ Dataset_Harvester,
24
+ DataService_Harvester,
25
+ Service_Harvester,
26
+ )
27
+ from utils import config, Solr
28
+ from utils import embeddings
29
+ from utils.qdrant import Qdrant
30
+
31
+ log = logging.getLogger(__name__)
32
+
33
+
34
def get_harvester(
    harvester_name=None,
    dataset_options=None,
    dataservice_options=None,
):
    """Instantiate the requested harvester(s).

    Args:
        harvester_name: a single harvester name, a list/tuple of names, or
            None to instantiate all known harvesters.
        dataset_options: extra keyword arguments for the Dataset harvester.
        dataservice_options: extra keyword arguments for the DataService
            harvester.

    Returns:
        list: instantiated harvester objects (order of the factory mapping).
    """
    # BUG FIX: avoid mutable default arguments ({} is shared across calls)
    dataset_options = dataset_options if dataset_options is not None else {}
    dataservice_options = (
        dataservice_options if dataservice_options is not None else {}
    )

    # Lazy initialization of repository singletons
    # Only one instance per repository type will be created and reused
    repo_singletons = {}
    repo_classes = {
        "themes_repo": RepositoryTheme,
        "persons_repo": RepositoryPerson,
        "links_repo": RepositoryResourceLinks,
        "n4e_orgas_repo": RepositoryN4EOrganization,
    }

    def get_repo(repo_name):
        # Returns a singleton instance of the requested repository.
        # Instantiates the repository only on first access.
        if repo_name not in repo_singletons:
            repo_singletons[repo_name] = repo_classes[repo_name]()
        return repo_singletons[repo_name]

    # Mapping from harvester name to factory function
    # Each factory uses get_repo to ensure repositories are only created once
    harvester_factories = {
        "Service": lambda: Service_Harvester(get_repo("n4e_orgas_repo")),
        "DataService": lambda: DataService_Harvester(
            get_repo("themes_repo"),
            get_repo("persons_repo"),
            **dataservice_options,
        ),
        "Repository": lambda: Repository_Harvester(
            get_repo("themes_repo"),
            get_repo("n4e_orgas_repo"),
        ),
        "Organization": lambda: Organization_Harvester(
            get_repo("persons_repo")
        ),
        "Article": lambda: Article_Harvester(
            get_repo("persons_repo"),
            get_repo("links_repo"),
            get_repo("themes_repo"),
        ),
        "Softwaresourcecode": lambda: Softwaresourcecode_Harvester(
            get_repo("persons_repo"),
            get_repo("links_repo"),
        ),
        "Learningresource": lambda: Learningresource_Harvester(
            get_repo("persons_repo"),
            get_repo("themes_repo"),
        ),
        "Metadatastandard": lambda: Metadatastandard_Harvester(
            get_repo("themes_repo")
        ),
        "Document": lambda: Document_Harvester(
            get_repo("persons_repo"),
            get_repo("links_repo"),
        ),
        "Dataset": lambda: Dataset_Harvester(
            get_repo("persons_repo"),
            get_repo("links_repo"),
            **dataset_options,
        ),
    }

    # If no harvester_name is given, instantiate all harvesters
    if not harvester_name:
        return [factory() for factory in harvester_factories.values()]

    # If a name or list/tuple of names is given, instantiate only those
    # This keeps compatibility with previous behavior
    if isinstance(harvester_name, (list, tuple)):
        names = harvester_name
    else:
        names = [harvester_name]

    return [
        harvester_factories[name]()
        for name in harvester_factories
        if name in names
    ]
116
+
117
+
118
def run(
    requested_harvester: Optional[tuple] = None,
    reset_index: Optional[bool] = None,
    **further_options,
):
    """Run the requested harvesters and index the results.

    Each harvester's documents get embeddings attached, are indexed into the
    Qdrant vector db, stripped of their embeddings, then indexed into Solr.
    On failure the exception is reported to the configured mail service; on
    success a stats summary is sent instead.

    Args:
        requested_harvester: harvester name(s) to run; None runs all.
        reset_index: if truthy, wipe the Solr index before harvesting.
        **further_options: forwarded to :func:`get_harvester`
            (``dataset_options``, ``dataservice_options``).
    """
    solr = Solr()
    qdrant = Qdrant()  # vector db

    if reset_index:
        solr.reset_index()

    stats = {}

    i = 1
    harvesters = get_harvester(
        harvester_name=requested_harvester, **further_options
    )
    log.info(f"Initialized {len(harvesters)} harvesters.")
    document_count = 0
    try:
        for harvester in harvesters:
            log.info(f"start harvester {harvester} ({i} of {len(harvesters)})")
            start_time = timeit.default_timer()
            documents = harvester.harvest()
            elapsed_time = timeit.default_timer() - start_time

            # embeddings are best-effort: indexing continues without them
            try:
                generate_embeddings(documents)
            except Exception as e:
                log.error(e)

            log.info(f"index harvested {len(documents)} documents (vector db)")
            qdrant.index_documents(documents)
            log.info(
                f"finished indexing for harvester {i} of {len(harvesters)} (vector db)"
            )

            # remove embeddings from document, otherwise embeddings would be
            # indexed in solr. Comment this loop if embeddings should be
            # indexed in solr as well.
            for document in documents:
                document.pop("embedding_384", None)

            log.info(f"index harvested {len(documents)} documents")
            solr.index_documents(documents)
            document_count += len(documents)
            log.info(
                f"finished indexing for harvester {i} of {len(harvesters)}"
            )

            stats[harvester.get_type()] = {
                "document_count": len(documents),
                "processing_time": round(elapsed_time, 4),
            }
            if harvester.get_notes():
                stats[harvester.get_type()]["notes"] = harvester.get_notes()
            i += 1
    except Exception as e:
        data = {
            "exception": str(e),
            "traceback": traceback.format_exc(),
            "hostname": gethostname(),
        }
        if config["mailserver_url"]:
            # NOTE(review): verify=False disables TLS verification — confirm
            # this is intentional for the internal mail service
            requests.post(config["mailserver_url"], json=data, verify=False)
    else:
        log.info(
            "harvesting completed, indexed {} documents".format(document_count)
        )
        # BUG FIX: guard the success report like the failure branch does,
        # so an unset mailserver_url no longer crashes the run
        if config["mailserver_url"]:
            requests.post(
                config["mailserver_url"],
                json={"stats": stats, "hostname": gethostname()},
                verify=False,
            )
191
+
192
+
193
def generate_embeddings(documents):
    """Attach an embedding vector to every document in *documents* in-place."""
    log.info("create embeddings for " + str(len(documents)) + " documents")
    for doc in documents:
        embeddings.add_embedding(doc)
    log.info("created embeddings for batch")
198
+
199
+
200
+ if __name__ == "__main__":
201
+ run()
@@ -0,0 +1,97 @@
1
+ import datetime
2
+ import logging
3
+ import uuid
4
+
5
+ from qdrant_client import QdrantClient, models
6
+ from typing import List, Dict
7
+
8
+ from utils import config
9
+
10
+ log = logging.getLogger(__name__)
11
+
12
+
13
class Qdrant:
    """Thin wrapper around the Qdrant vector database used to index
    document embeddings alongside the Solr index."""

    def __init__(self):
        self.endpoint = config["qdrant_url"]
        # BUG FIX: the api_key was previously read from config["qdrant_url"],
        # i.e. the endpoint URL was passed as the API key
        self.client = QdrantClient(
            self.endpoint, api_key=config["qdrant_api_key"]
        )
        log.debug(f"Initialized Qdrant client with endpoint {self.endpoint}")
        # one collection per embedding dimension; "name" doubles as the
        # document field that holds the vector
        self.collections = [
            {"dim": 384, "name": "embedding_384"}
            # {"dim": 768, "name": "embedding_768"},
            # {"dim": 1024, "name": "embedding_1024"}
        ]

        self.init_collections()

    def init_collections(self):
        """Ensure every configured collection exists."""
        for collection in self.collections:
            self.init_collection(collection["dim"], collection["name"])

    def init_collection(self, vector_dim, collection_name):
        """Create *collection_name* (cosine distance, *vector_dim* size) if it
        does not exist yet, plus a keyword payload index on ``id``."""
        collections = self.client.get_collections()  # existing collections
        collection_names = [col.name for col in collections.collections]

        try:
            if (
                collection_name not in collection_names
            ):  # only create if not existing
                self.client.create_collection(
                    collection_name=collection_name,
                    vectors_config=models.VectorParams(
                        size=vector_dim, distance=models.Distance.COSINE
                    ),
                )
                self.client.create_payload_index(
                    collection_name=collection_name,
                    field_name="id",
                    field_schema="keyword",
                )
        except Exception as e:
            log.error(e)

    def index_documents(self, documents):
        """Index *documents* into every configured collection."""
        for collection in self.collections:
            self.index_documents_dim(
                documents,
                collection_name=collection["name"],
                embedding_key=collection["name"],
            )

    def index_documents_dim(
        self, documents: List[Dict], collection_name, embedding_key
    ) -> None:
        """Upload one batch of documents to *collection_name*, taking vectors
        from the *embedding_key* field of each document.

        Documents without an embedding (e.g. nested person documents, for
        which no embedding is generated) are skipped instead of aborting the
        whole batch with a KeyError.
        """
        log.info(
            f"start indexing {len(documents)} documents to {self.endpoint}"
        )

        embeddings = []
        payloads = []
        ids = []

        try:
            for document in documents:
                if embedding_key not in document:
                    # BUG FIX: previously raised KeyError below, which the
                    # broad except swallowed — losing the entire batch
                    continue
                ids.append(
                    str(uuid.uuid5(uuid.NAMESPACE_DNS, document["id"]))
                )  # generates always the same uuid for the same document id
                embeddings.append(document[embedding_key])
                payload = {
                    "id": document["id"],
                    "mainTitle": document["mainTitle"],
                    "type": document["type"],
                    "indexedAt": datetime.datetime.now(datetime.timezone.utc),
                }
                if "description" in document:
                    payload["description"] = document["description"]
                # BUG FIX: previously checked "keywords" but read "keyword",
                # raising KeyError for documents that only have "keywords"
                if "keyword" in document:
                    payload["keyword"] = document["keyword"]
                elif "keywords" in document:
                    payload["keyword"] = document["keywords"]
                payloads.append(payload)

            self.client.upload_collection(
                collection_name=collection_name,
                vectors=embeddings,
                ids=ids,
                payload=payloads,
            )
        except Exception as e:
            log.error(e)
@@ -1,123 +0,0 @@
1
- import logging
2
- import requests
3
- import timeit
4
- import traceback
5
-
6
- from socket import gethostname
7
- from typing import Optional
8
-
9
- from data_repositories import (
10
- RepositoryTheme,
11
- RepositoryPerson,
12
- RepositoryResourceLinks,
13
- RepositoryN4EOrganization,
14
- )
15
- from harvesters import (
16
- Repository_Harvester,
17
- Organization_Harvester,
18
- Article_Harvester,
19
- Softwaresourcecode_Harvester,
20
- Learningresource_Harvester,
21
- Metadatastandard_Harvester,
22
- Document_Harvester,
23
- Dataset_Harvester,
24
- DataService_Harvester,
25
- Service_Harvester,
26
- )
27
- from utils import config, Solr
28
-
29
- log = logging.getLogger(__name__)
30
-
31
-
32
- def run(
33
- requested_harvester: Optional[tuple] = None,
34
- reset_index: Optional[bool] = None,
35
- dataset_options: Optional[dict] = {},
36
- dataservice_options: Optional[dict] = {},
37
- ):
38
- # init in memory data repos
39
- themes_repo = RepositoryTheme()
40
- persons_repo = RepositoryPerson()
41
- links_repo = RepositoryResourceLinks()
42
- n4e_orgas_repo = RepositoryN4EOrganization()
43
-
44
- solr = Solr()
45
-
46
- if reset_index:
47
- solr.reset_index()
48
-
49
- stats = {}
50
-
51
- _harvesters = {
52
- "Service": Service_Harvester(n4e_organizations_repo=n4e_orgas_repo),
53
- "DataService": DataService_Harvester(
54
- themes_repo, persons_repo, **dataservice_options
55
- ),
56
- "Repository": Repository_Harvester(
57
- themes_repo, n4e_organizations_repo=n4e_orgas_repo
58
- ),
59
- "Organization": Organization_Harvester(persons_repo),
60
- "Article": Article_Harvester(persons_repo, links_repo, themes_repo),
61
- "Softwaresourcecode": Softwaresourcecode_Harvester(
62
- persons_repo, links_repo
63
- ),
64
- "Learningresource": Learningresource_Harvester(
65
- persons_repo, themes_repo
66
- ),
67
- "Metadatastandard": Metadatastandard_Harvester(themes_repo),
68
- "Document": Document_Harvester(persons_repo, links_repo),
69
- "Dataset": Dataset_Harvester(
70
- persons_repo, links_repo, **dataset_options
71
- ),
72
- }
73
- i = 1
74
- document_count = 0
75
- try:
76
- for harvester_name in _harvesters:
77
- if (
78
- requested_harvester
79
- and harvester_name not in requested_harvester
80
- ):
81
- continue
82
- log.info(
83
- f"start harvester {harvester_name} ({i} of {len(_harvesters)}"
84
- )
85
- harvester = _harvesters[harvester_name]
86
- start_time = timeit.default_timer()
87
- documents = harvester.harvest()
88
- elapsed_time = timeit.default_timer() - start_time
89
-
90
- log.info(f"index harvested {len(documents)} documents")
91
- solr.index_documents(documents)
92
- document_count += len(documents)
93
- log.info(
94
- f"finished indexing for harvester {i} of {len(_harvesters)})"
95
- )
96
- stats[harvester.get_type()] = {
97
- "document_count": len(documents),
98
- "processing_time": round(elapsed_time, 4),
99
- }
100
- if harvester.get_notes():
101
- stats[harvester.get_type()]["notes"] = harvester.get_notes()
102
- i += 1
103
- except Exception as e:
104
- data = {
105
- "exception": str(e),
106
- "traceback": traceback.format_exc(),
107
- "hostname": gethostname(),
108
- }
109
- if config["mailserver_url"]:
110
- requests.post(config["mailserver_url"], json=data, verify=False)
111
- else:
112
- log.info(
113
- "harvesting completed, indexed {} documents".format(document_count)
114
- )
115
- requests.post(
116
- config["mailserver_url"],
117
- json={"stats": stats, "hostname": gethostname()},
118
- verify=False,
119
- )
120
-
121
-
122
- if __name__ == "__main__":
123
- run()