OneStop4All-Indexer 2.8.0.dev10__tar.gz → 2.8.0.dev12__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (36)
  1. {onestop4all_indexer-2.8.0.dev10 → onestop4all_indexer-2.8.0.dev12/OneStop4All_Indexer.egg-info}/PKG-INFO +2 -1
  2. {onestop4all_indexer-2.8.0.dev10 → onestop4all_indexer-2.8.0.dev12}/OneStop4All_Indexer.egg-info/requires.txt +1 -0
  3. {onestop4all_indexer-2.8.0.dev10/OneStop4All_Indexer.egg-info → onestop4all_indexer-2.8.0.dev12}/PKG-INFO +2 -1
  4. {onestop4all_indexer-2.8.0.dev10 → onestop4all_indexer-2.8.0.dev12}/harvesters/harvester_dataset.py +0 -1
  5. {onestop4all_indexer-2.8.0.dev10 → onestop4all_indexer-2.8.0.dev12}/harvesters/harvester_organization.py +28 -20
  6. {onestop4all_indexer-2.8.0.dev10 → onestop4all_indexer-2.8.0.dev12}/setup.py +5 -2
  7. {onestop4all_indexer-2.8.0.dev10 → onestop4all_indexer-2.8.0.dev12}/utils/solr.py +14 -6
  8. {onestop4all_indexer-2.8.0.dev10 → onestop4all_indexer-2.8.0.dev12}/LICENSE +0 -0
  9. {onestop4all_indexer-2.8.0.dev10 → onestop4all_indexer-2.8.0.dev12}/OneStop4All_Indexer.egg-info/SOURCES.txt +0 -0
  10. {onestop4all_indexer-2.8.0.dev10 → onestop4all_indexer-2.8.0.dev12}/OneStop4All_Indexer.egg-info/dependency_links.txt +0 -0
  11. {onestop4all_indexer-2.8.0.dev10 → onestop4all_indexer-2.8.0.dev12}/OneStop4All_Indexer.egg-info/entry_points.txt +0 -0
  12. {onestop4all_indexer-2.8.0.dev10 → onestop4all_indexer-2.8.0.dev12}/OneStop4All_Indexer.egg-info/top_level.txt +0 -0
  13. {onestop4all_indexer-2.8.0.dev10 → onestop4all_indexer-2.8.0.dev12}/data_repositories/__init__.py +0 -0
  14. {onestop4all_indexer-2.8.0.dev10 → onestop4all_indexer-2.8.0.dev12}/data_repositories/repository_base.py +0 -0
  15. {onestop4all_indexer-2.8.0.dev10 → onestop4all_indexer-2.8.0.dev12}/data_repositories/repository_n4eorganization.py +0 -0
  16. {onestop4all_indexer-2.8.0.dev10 → onestop4all_indexer-2.8.0.dev12}/data_repositories/repository_person.py +0 -0
  17. {onestop4all_indexer-2.8.0.dev10 → onestop4all_indexer-2.8.0.dev12}/data_repositories/repository_resource_links.py +0 -0
  18. {onestop4all_indexer-2.8.0.dev10 → onestop4all_indexer-2.8.0.dev12}/data_repositories/repository_theme.py +0 -0
  19. {onestop4all_indexer-2.8.0.dev10 → onestop4all_indexer-2.8.0.dev12}/harvesters/__init__.py +0 -0
  20. {onestop4all_indexer-2.8.0.dev10 → onestop4all_indexer-2.8.0.dev12}/harvesters/harvester_article.py +0 -0
  21. {onestop4all_indexer-2.8.0.dev10 → onestop4all_indexer-2.8.0.dev12}/harvesters/harvester_base.py +0 -0
  22. {onestop4all_indexer-2.8.0.dev10 → onestop4all_indexer-2.8.0.dev12}/harvesters/harvester_dataservice.py +0 -0
  23. {onestop4all_indexer-2.8.0.dev10 → onestop4all_indexer-2.8.0.dev12}/harvesters/harvester_document.py +0 -0
  24. {onestop4all_indexer-2.8.0.dev10 → onestop4all_indexer-2.8.0.dev12}/harvesters/harvester_learningresource.py +0 -0
  25. {onestop4all_indexer-2.8.0.dev10 → onestop4all_indexer-2.8.0.dev12}/harvesters/harvester_metadatastandards.py +0 -0
  26. {onestop4all_indexer-2.8.0.dev10 → onestop4all_indexer-2.8.0.dev12}/harvesters/harvester_repository.py +0 -0
  27. {onestop4all_indexer-2.8.0.dev10 → onestop4all_indexer-2.8.0.dev12}/harvesters/harvester_service.py +0 -0
  28. {onestop4all_indexer-2.8.0.dev10 → onestop4all_indexer-2.8.0.dev12}/harvesters/harvester_softwaresourcecode.py +0 -0
  29. {onestop4all_indexer-2.8.0.dev10 → onestop4all_indexer-2.8.0.dev12}/pyproject.toml +0 -0
  30. {onestop4all_indexer-2.8.0.dev10 → onestop4all_indexer-2.8.0.dev12}/setup.cfg +0 -0
  31. {onestop4all_indexer-2.8.0.dev10 → onestop4all_indexer-2.8.0.dev12}/utils/__init__.py +0 -0
  32. {onestop4all_indexer-2.8.0.dev10 → onestop4all_indexer-2.8.0.dev12}/utils/cli.py +0 -0
  33. {onestop4all_indexer-2.8.0.dev10 → onestop4all_indexer-2.8.0.dev12}/utils/configs.py +0 -0
  34. {onestop4all_indexer-2.8.0.dev10 → onestop4all_indexer-2.8.0.dev12}/utils/harvest.py +0 -0
  35. {onestop4all_indexer-2.8.0.dev10 → onestop4all_indexer-2.8.0.dev12}/utils/sparql.py +0 -0
  36. {onestop4all_indexer-2.8.0.dev10 → onestop4all_indexer-2.8.0.dev12}/utils/util.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: OneStop4All-Indexer
3
- Version: 2.8.0.dev10
3
+ Version: 2.8.0.dev12
4
4
  Summary: Library to harvest data from NFDI4Earth-KnowledgeHub to OneStop4All-Index
5
5
  Author: Markus Konkol, Arne Vogt, Tom Niers, Ralf Klammer
6
6
  Author-email: m.konkol@52north.org, a.vogt@52north.org, tom.niers@tu-dresden.de, ralf.klammer@tu-dresden.de
@@ -13,6 +13,7 @@ Requires-Dist: geomet~=1.1.0
13
13
  Requires-Dist: shapely~=2.0.5
14
14
  Provides-Extra: airflow
15
15
  Requires-Dist: apache-airflow==3.1.7; extra == "airflow"
16
+ Requires-Dist: apache-airflow-providers-amazon[s3fs]; extra == "airflow"
16
17
  Dynamic: author
17
18
  Dynamic: author-email
18
19
  Dynamic: license-file
@@ -7,3 +7,4 @@ shapely~=2.0.5
7
7
 
8
8
  [airflow]
9
9
  apache-airflow==3.1.7
10
+ apache-airflow-providers-amazon[s3fs]
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: OneStop4All-Indexer
3
- Version: 2.8.0.dev10
3
+ Version: 2.8.0.dev12
4
4
  Summary: Library to harvest data from NFDI4Earth-KnowledgeHub to OneStop4All-Index
5
5
  Author: Markus Konkol, Arne Vogt, Tom Niers, Ralf Klammer
6
6
  Author-email: m.konkol@52north.org, a.vogt@52north.org, tom.niers@tu-dresden.de, ralf.klammer@tu-dresden.de
@@ -13,6 +13,7 @@ Requires-Dist: geomet~=1.1.0
13
13
  Requires-Dist: shapely~=2.0.5
14
14
  Provides-Extra: airflow
15
15
  Requires-Dist: apache-airflow==3.1.7; extra == "airflow"
16
+ Requires-Dist: apache-airflow-providers-amazon[s3fs]; extra == "airflow"
16
17
  Dynamic: author
17
18
  Dynamic: author-email
18
19
  Dynamic: license-file
@@ -299,7 +299,6 @@ class Dataset_Harvester(HarvesterCordra):
299
299
  value=val,
300
300
  )
301
301
  if "downloadURL" in distribution:
302
- print(distribution["downloadURL"])
303
302
  for download_url in distribution["downloadURL"]:
304
303
  val = self.get_string_from_jsonld(
305
304
  download_url, subject
@@ -1,10 +1,10 @@
1
- from .harvester_base import Harvester
2
- from data_repositories.repository_person import RepositoryPerson
3
- from utils import sparql
4
- from utils import flatten_dict
5
- from utils import is_truthy
6
1
  import logging
7
2
 
3
+ from data_repositories.repository_person import RepositoryPerson
4
+ from utils import sparql, flatten_dict, is_truthy
5
+
6
+ from .harvester_base import Harvester
7
+
8
8
  log = logging.getLogger(__name__)
9
9
 
10
10
 
@@ -20,32 +20,37 @@ class Organization_Harvester(Harvester):
20
20
  PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
21
21
  PREFIX org: <http://www.w3.org/ns/org#>
22
22
  PREFIX schema: <http://schema.org/>
23
+ PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>
23
24
 
24
25
  SELECT ?subject ?predicate ?object ?geo_as_wkt ?isN4EMember
25
26
  WHERE {
26
- ?subject rdf:type foaf:Organization .
27
+ {
28
+ SELECT DISTINCT ?subject
29
+ WHERE {
30
+ ?subject rdf:type foaf:Organization .
31
+ BIND(STRAFTER(STR(?subject), "/objects/") AS ?id)
32
+ FILTER(STRSTARTS(?id, "n4e/"))
33
+ FILTER(!CONTAINS(?id, "jsonPointer"))
34
+ FILTER EXISTS { ?subject n4e:sourceSystem ?source }
35
+ }
36
+ OFFSET %s
37
+ LIMIT %s
38
+ }
39
+
27
40
  ?subject ?predicate ?object .
28
- # Exclude messy data that was harvested from remote source, contains duplicates, etc
29
- FILTER EXISTS { ?subject n4e:sourceSystem ?source }
41
+
30
42
  OPTIONAL {
31
43
  ?subject geo:hasGeometry ?geometry .
32
44
  ?geometry geo:asWKT ?geo_as_wkt .
33
45
  }
46
+
34
47
  OPTIONAL {
35
- ?subject n4e:sourceSystem ?source_system .
36
- OPTIONAL { ?source_system dct:title ?sourceSystem_title . }
37
- OPTIONAL { ?source_system foaf:homepage ?sourceSystem_homepage . }
38
- }
39
- OPTIONAL {
40
- ?subject rdf:type foaf:Organization ;
41
- org:hasMembership ?membership .
48
+ ?subject org:hasMembership ?membership .
42
49
  ?membership org:organization ?n4eproject .
43
50
  ?n4eproject schema:url "https://nfdi4earth.de/"^^xsd:anyURI .
44
51
  BIND(true AS ?isN4EMember)
45
52
  }
46
53
  }
47
- OFFSET %d
48
- LIMIT %d
49
54
  """
50
55
 
51
56
  def __init__(self, persons_repo: RepositoryPerson, **kw):
@@ -150,13 +155,16 @@ class Organization_Harvester(Harvester):
150
155
  # set geometry if available and not already set
151
156
  if (
152
157
  predicate == "http://www.opengis.net/ont/geosparql#hasGeometry"
153
- and ("geometry" not in organizations[subject]
154
- or hit["geo_as_wkt"]["value"] not in organizations[subject]["geometry"])
158
+ and (
159
+ "geometry" not in organizations[subject]
160
+ or hit["geo_as_wkt"]["value"]
161
+ not in organizations[subject]["geometry"]
162
+ )
155
163
  ):
156
164
  self.addValue(
157
165
  dict=organizations[subject],
158
166
  attribute="geometry",
159
- value=hit["geo_as_wkt"]["value"]
167
+ value=hit["geo_as_wkt"]["value"],
160
168
  )
161
169
 
162
170
  # set membership in N4E project
@@ -2,7 +2,7 @@ from setuptools import find_packages, setup
2
2
 
3
3
  setup(
4
4
  name="OneStop4All-Indexer",
5
- version="2.8.0.dev10",
5
+ version="2.8.0.dev12",
6
6
  description="Library to harvest data from NFDI4Earth-KnowledgeHub to OneStop4All-Index",
7
7
  author="Markus Konkol, Arne Vogt, Tom Niers, Ralf Klammer",
8
8
  author_email="m.konkol@52north.org, a.vogt@52north.org, tom.niers@tu-dresden.de, ralf.klammer@tu-dresden.de",
@@ -16,7 +16,10 @@ setup(
16
16
  "shapely ~= 2.0.5",
17
17
  ],
18
18
  extras_require={
19
- "airflow": ["apache-airflow==3.1.7"],
19
+ "airflow": [
20
+ "apache-airflow==3.1.7",
21
+ "apache-airflow-providers-amazon[s3fs]",
22
+ ],
20
23
  },
21
24
  include_package_data=True,
22
25
  entry_points={
@@ -57,7 +57,9 @@ class Solr(object):
57
57
  )
58
58
  return HTTPBasicAuth(username, password)
59
59
 
60
- def index_documents(self, documents: List[Dict]) -> None:
60
+ def index_documents(
61
+ self, documents: List[Dict], commit=True, ping=True
62
+ ) -> None:
61
63
  # solr_endpoint = coreurl(solr_url, solr_core)
62
64
  log.info(f"start indexing {len(documents)} documents")
63
65
  batch_size = 50000
@@ -87,13 +89,19 @@ class Solr(object):
87
89
  if len(batch) < batch_size:
88
90
  break
89
91
 
90
- log.info("commit changes to index")
91
- self.client.ping()
92
- log.info("solr healtcheck successful")
93
- self.client.commit()
94
- log.info("sucessfully commited changes to index")
92
+ if ping is True:
93
+ self.client.ping()
94
+ log.info("solr healtcheck successful")
95
+
96
+ if commit is True:
97
+ log.info("commit changes to index")
98
+ self.client.commit()
99
+ log.info("sucessfully commited changes to index")
95
100
  log.info("finished indexing")
96
101
 
102
+ def commit(self):
103
+ self.client.commit()
104
+
97
105
  def reset_index(self):
98
106
  self.client.delete(q="*:*")
99
107
  self.client.commit()