udata 7.0.4.dev27782__py2.py3-none-any.whl → 7.0.5__py2.py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of udata might be problematic. Click here for more details.

Files changed (71) hide show
  1. udata/__init__.py +1 -1
  2. udata/api/__init__.py +1 -1
  3. udata/core/dataset/api.py +14 -14
  4. udata/core/dataset/api_fields.py +7 -7
  5. udata/core/dataset/apiv2.py +3 -3
  6. udata/core/dataset/rdf.py +43 -1
  7. udata/core/organization/csv.py +27 -1
  8. udata/core/organization/models.py +20 -1
  9. udata/core/organization/tasks.py +61 -1
  10. udata/core/spatial/commands.py +26 -2
  11. udata/core/topic/api.py +6 -0
  12. udata/core/topic/apiv2.py +6 -0
  13. udata/core/topic/forms.py +5 -0
  14. udata/core/topic/models.py +3 -5
  15. udata/forms/fields.py +10 -0
  16. udata/frontend/csv.py +8 -8
  17. udata/harvest/actions.py +11 -0
  18. udata/harvest/api.py +3 -3
  19. udata/harvest/backends/dcat.py +42 -5
  20. udata/harvest/tests/dcat/bnodes.xml +16 -2
  21. udata/harvest/tests/test_dcat_backend.py +87 -1
  22. udata/settings.py +9 -0
  23. udata/static/chunks/{11.c0ccea08914b6b41568e.js → 11.a23c110811a9ac943478.js} +3 -3
  24. udata/static/chunks/{11.c0ccea08914b6b41568e.js.map → 11.a23c110811a9ac943478.js.map} +1 -1
  25. udata/static/chunks/{13.526a25163ababaa44409.js → 13.0889e093f8664e38568c.js} +2 -2
  26. udata/static/chunks/{13.526a25163ababaa44409.js.map → 13.0889e093f8664e38568c.js.map} +1 -1
  27. udata/static/chunks/{16.7901839b4227881947f6.js → 16.f41599478d3e97ad9a30.js} +2 -2
  28. udata/static/chunks/{16.7901839b4227881947f6.js.map → 16.f41599478d3e97ad9a30.js.map} +1 -1
  29. udata/static/chunks/{19.471d5a2a08eef6e5338a.js → 19.2b534a26af8b17e9170b.js} +3 -3
  30. udata/static/chunks/{19.471d5a2a08eef6e5338a.js.map → 19.2b534a26af8b17e9170b.js.map} +1 -1
  31. udata/static/chunks/{5.534e0531d0e2b150146f.js → 5.7115454a1183e5c12eef.js} +3 -3
  32. udata/static/chunks/{5.534e0531d0e2b150146f.js.map → 5.7115454a1183e5c12eef.js.map} +1 -1
  33. udata/static/chunks/{6.e56975229e6065f68d2a.js → 6.16bb24fb8240f2746488.js} +3 -3
  34. udata/static/chunks/{6.e56975229e6065f68d2a.js.map → 6.16bb24fb8240f2746488.js.map} +1 -1
  35. udata/static/chunks/{9.534426728626f11f4571.js → 9.3e752966ff14e47e11f2.js} +2 -2
  36. udata/static/chunks/{9.534426728626f11f4571.js.map → 9.3e752966ff14e47e11f2.js.map} +1 -1
  37. udata/static/common.js +1 -1
  38. udata/static/common.js.map +1 -1
  39. udata/storage/__init__.py +0 -0
  40. udata/storage/s3.py +54 -0
  41. udata/templates/mail/badge_added_association.html +33 -0
  42. udata/templates/mail/badge_added_association.txt +11 -0
  43. udata/templates/mail/badge_added_company.html +33 -0
  44. udata/templates/mail/badge_added_company.txt +11 -0
  45. udata/templates/mail/badge_added_local_authority.html +33 -0
  46. udata/templates/mail/badge_added_local_authority.txt +11 -0
  47. udata/tests/api/test_datasets_api.py +27 -0
  48. udata/tests/api/test_topics_api.py +31 -1
  49. udata/tests/apiv2/test_topics.py +4 -0
  50. udata/tests/organization/test_csv_adapter.py +43 -0
  51. udata/translations/ar/LC_MESSAGES/udata.mo +0 -0
  52. udata/translations/ar/LC_MESSAGES/udata.po +90 -44
  53. udata/translations/de/LC_MESSAGES/udata.mo +0 -0
  54. udata/translations/de/LC_MESSAGES/udata.po +91 -45
  55. udata/translations/es/LC_MESSAGES/udata.mo +0 -0
  56. udata/translations/es/LC_MESSAGES/udata.po +90 -44
  57. udata/translations/fr/LC_MESSAGES/udata.mo +0 -0
  58. udata/translations/fr/LC_MESSAGES/udata.po +91 -45
  59. udata/translations/it/LC_MESSAGES/udata.mo +0 -0
  60. udata/translations/it/LC_MESSAGES/udata.po +90 -44
  61. udata/translations/pt/LC_MESSAGES/udata.mo +0 -0
  62. udata/translations/pt/LC_MESSAGES/udata.po +91 -45
  63. udata/translations/sr/LC_MESSAGES/udata.mo +0 -0
  64. udata/translations/sr/LC_MESSAGES/udata.po +91 -45
  65. udata/translations/udata.pot +91 -45
  66. {udata-7.0.4.dev27782.dist-info → udata-7.0.5.dist-info}/METADATA +20 -3
  67. {udata-7.0.4.dev27782.dist-info → udata-7.0.5.dist-info}/RECORD +71 -62
  68. {udata-7.0.4.dev27782.dist-info → udata-7.0.5.dist-info}/LICENSE +0 -0
  69. {udata-7.0.4.dev27782.dist-info → udata-7.0.5.dist-info}/WHEEL +0 -0
  70. {udata-7.0.4.dev27782.dist-info → udata-7.0.5.dist-info}/entry_points.txt +0 -0
  71. {udata-7.0.4.dev27782.dist-info → udata-7.0.5.dist-info}/top_level.txt +0 -0
udata/harvest/api.py CHANGED
@@ -25,7 +25,7 @@ def backends_ids():
25
25
 
26
26
  error_fields = api.model('HarvestError', {
27
27
  'created_at': fields.ISODateTime(description='The error creation date',
28
- required=True),
28
+ required=True, readonly=True),
29
29
  'message': fields.String(description='The error short message',
30
30
  required=True),
31
31
  'details': fields.String(description='Optional details (ie. stacktrace)'),
@@ -99,7 +99,7 @@ source_fields = api.model('HarvestSource', {
99
99
  required=True),
100
100
  'config': fields.Raw(description='The configuration as key-value pairs'),
101
101
  'created_at': fields.ISODateTime(description='The source creation date',
102
- required=True),
102
+ required=True, readonly=True),
103
103
  'active': fields.Boolean(description='Is this source active',
104
104
  required=True, default=False),
105
105
  'autoarchive': fields.Boolean(
@@ -114,7 +114,7 @@ source_fields = api.model('HarvestSource', {
114
114
  description='The owner information'),
115
115
  'organization': fields.Nested(org_ref_fields, allow_null=True,
116
116
  description='The producer organization'),
117
- 'deleted': fields.ISODateTime(description='The source deletion date'),
117
+ 'deleted': fields.ISODateTime(description='The source deletion date', readonly=True),
118
118
  'schedule': fields.String(description='The source schedule (interval or cron expression)',
119
119
  readonly=True),
120
120
  })
@@ -3,12 +3,17 @@ import logging
3
3
  from rdflib import Graph, URIRef
4
4
  from rdflib.namespace import RDF
5
5
  import xml.etree.ElementTree as ET
6
+ import boto3
7
+ from flask import current_app
8
+ from datetime import date
9
+ import json
6
10
  from typing import List
7
11
 
8
12
  from udata.rdf import (
9
13
  DCAT, DCT, HYDRA, SPDX, namespace_manager, guess_format, url_from_rdf
10
14
  )
11
15
  from udata.core.dataset.rdf import dataset_from_rdf
16
+ from udata.storage.s3 import store_as_json, get_from_json
12
17
 
13
18
  from .base import BaseBackend
14
19
 
@@ -58,10 +63,30 @@ class DcatBackend(BaseBackend):
58
63
  '''List all datasets for a given ...'''
59
64
  fmt = self.get_format()
60
65
  graphs = self.parse_graph(self.source.url, fmt)
61
- self.job.data = {
62
- 'graphs': [graph.serialize(format=fmt, indent=None) for graph in graphs],
63
- 'format': fmt,
64
- }
66
+
67
+ self.job.data = { 'format': fmt }
68
+
69
+ serialized_graphs = [graph.serialize(format=fmt, indent=None) for graph in graphs]
70
+
71
+ # The official MongoDB document size limit is 16MB. The default value here is 15MB to account for other fields in the document (and for the difference between * 1024 vs * 1000).
72
+ max_harvest_graph_size_in_mongo = current_app.config.get('HARVEST_MAX_CATALOG_SIZE_IN_MONGO')
73
+ if max_harvest_graph_size_in_mongo is None:
74
+ max_harvest_graph_size_in_mongo = 15 * 1000 * 1000
75
+
76
+ bucket = current_app.config.get('HARVEST_GRAPHS_S3_BUCKET')
77
+
78
+ if bucket is not None and sum([len(g.encode('utf-8')) for g in serialized_graphs]) >= max_harvest_graph_size_in_mongo:
79
+ prefix = current_app.config.get('HARVEST_GRAPHS_S3_FILENAME_PREFIX') or ''
80
+
81
+ # TODO: we could store each page in independent files to allow downloading only the required page in
82
+ # subsequent jobs. (less data to download in each job)
83
+ filename = f'{prefix}harvest_{self.job.id}_{date.today()}.json'
84
+
85
+ store_as_json(bucket, filename, serialized_graphs)
86
+
87
+ self.job.data['filename'] = filename
88
+ else:
89
+ self.job.data['graphs'] = serialized_graphs
65
90
 
66
91
  def get_format(self):
67
92
  fmt = guess_format(self.source.url)
@@ -127,7 +152,19 @@ class DcatBackend(BaseBackend):
127
152
  if item.remote_id == 'None':
128
153
  raise ValueError('The DCT.identifier is missing on this DCAT.Dataset record')
129
154
  graph = Graph(namespace_manager=namespace_manager)
130
- data = self.job.data['graphs'][item.kwargs['page']]
155
+
156
+ if self.job.data.get('graphs') is not None:
157
+ graphs = self.job.data['graphs']
158
+ else:
159
+ bucket = current_app.config.get('HARVEST_GRAPHS_S3_BUCKET')
160
+ if bucket is None:
161
+ raise ValueError(f"No bucket configured but the harvest job item {item.id} on job {self.job.id} doesn't have a graph in MongoDB.")
162
+
163
+ graphs = get_from_json(bucket, self.job.data['filename'])
164
+ if graphs is None:
165
+ raise ValueError(f"The file '{self.job.data['filename']}' is missing in S3 bucket '{bucket}'")
166
+
167
+ data = graphs[item.kwargs['page']]
131
168
  format = self.job.data['format']
132
169
 
133
170
  graph.parse(data=bytes(data, encoding='utf8'), format=format)
@@ -5,6 +5,8 @@
5
5
  xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
6
6
  xmlns:dcat="http://www.w3.org/ns/dcat#"
7
7
  xmlns:dct="http://purl.org/dc/terms/"
8
+ xmlns:ogc="http://www.opengis.net/ogc"
9
+ xmlns:geo="http://www.w3.org/2003/01/geo/wgs84_pos#"
8
10
  xmlns:dcterms="http://purl.org/dc/terms/"
9
11
  xmlns:vcard="http://www.w3.org/2006/vcard/ns#"
10
12
  xmlns:schema="http://schema.org/"
@@ -54,7 +56,13 @@
54
56
  <owl:versionInfo>1.0</owl:versionInfo>
55
57
  <dcat:distribution rdf:resource="http://data.test.org/datasets/1/resources/2"/>
56
58
  <dcat:keyword>Tag 4</dcat:keyword>
57
- <dcterms:spatial rdf:resource="http://wuEurope.com/"/>
59
+ <dct:spatial>
60
+ <ogc:Polygon>
61
+ <geo:asWKT rdf:datatype="http://www.opengis.net/rdf#wktLiteral">
62
+ wrong wkt
63
+ </geo:asWKT>
64
+ </ogc:Polygon>
65
+ </dct:spatial>
58
66
  <dcterms:modified rdf:datatype="http://www.w3.org/2001/XMLSchema#dateTime">2016-12-14T19:01:24.184120</dcterms:modified>
59
67
  <dcat:keyword>Tag 2</dcat:keyword>
60
68
  <dcat:keyword>Tag 1</dcat:keyword>
@@ -79,7 +87,13 @@
79
87
  <dcat:keyword>Tag 3</dcat:keyword>
80
88
  <dcat:distribution rdf:resource="http://data.test.org/datasets/2/resources/2"/>
81
89
  <dcterms:title>Dataset 2</dcterms:title>
82
- <dcterms:spatial rdf:resource="http://wuEurope.com/"/>
90
+ <dct:spatial>
91
+ <ogc:Polygon>
92
+ <geo:asWKT rdf:datatype="http://www.opengis.net/rdf#wktLiteral">
93
+ Polygon((4.44641288 45.54214467, 4.44641288 46.01316963, 4.75655252 46.01316963, 4.75655252 45.54214467, 4.44641288 45.54214467))
94
+ </geo:asWKT>
95
+ </ogc:Polygon>
96
+ </dct:spatial>
83
97
  <dcterms:identifier>2</dcterms:identifier>
84
98
  <dct:conformsTo rdf:resource="https://www.ecologie.gouv.fr/sites/default/files/R%C3%A9glementation%20IRVE.pdf" />
85
99
  </dcat:Dataset>
@@ -1,14 +1,18 @@
1
1
  import logging
2
2
  import os
3
+ from flask import current_app
3
4
 
4
5
  import pytest
5
6
 
6
7
  from datetime import date
8
+ import boto3
7
9
  import xml.etree.ElementTree as ET
10
+ from udata.harvest.models import HarvestJob
8
11
 
9
12
  from udata.models import Dataset
10
13
  from udata.core.organization.factories import OrganizationFactory
11
14
  from udata.core.dataset.factories import LicenseFactory, ResourceSchemaMockData
15
+ from udata.storage.s3 import get_from_json
12
16
 
13
17
  from .factories import HarvestSourceFactory
14
18
  from ..backends.dcat import URIS_TO_REPLACE
@@ -136,7 +140,7 @@ class DcatBackendTest:
136
140
  assert datasets['1'].resources[0].format == 'json'
137
141
  assert datasets['1'].resources[0].mime == 'application/json'
138
142
 
139
- @pytest.mark.options(SCHEMA_CATALOG_URL='https://example.com/schemas')
143
+ @pytest.mark.options(SCHEMA_CATALOG_URL='https://example.com/schemas', HARVEST_MAX_CATALOG_SIZE_IN_MONGO=None, HARVEST_GRAPHS_S3_BUCKET="test_bucket", S3_URL="https://example.org", S3_ACCESS_KEY_ID="myUser", S3_SECRET_ACCESS_KEY="password")
140
144
  def test_flat_with_blank_nodes_xml(self, rmock):
141
145
  rmock.get('https://example.com/schemas', json=ResourceSchemaMockData.get_mock_data())
142
146
 
@@ -156,6 +160,88 @@ class DcatBackendTest:
156
160
  assert len(datasets['1'].resources) == 2
157
161
  assert len(datasets['2'].resources) == 2
158
162
 
163
+ @pytest.mark.skip(reason="Mocking S3 requires `moto` which is not available for our current Python 3.7. We can manually test it.")
164
+ @pytest.mark.options(SCHEMA_CATALOG_URL='https://example.com/schemas', HARVEST_JOBS_RETENTION_DAYS=0)
165
+ # @mock_s3
166
+ # @pytest.mark.options(HARVEST_MAX_CATALOG_SIZE_IN_MONGO=15, HARVEST_GRAPHS_S3_BUCKET="test_bucket", S3_URL="https://example.org", S3_ACCESS_KEY_ID="myUser", S3_SECRET_ACCESS_KEY="password")
167
+ def test_harvest_big_catalog(self, rmock):
168
+ rmock.get('https://example.com/schemas', json=ResourceSchemaMockData.get_mock_data())
169
+
170
+ # We need to create the bucket since this is all in Moto's 'virtual' AWS account
171
+ # conn = boto3.resource(
172
+ # "s3",
173
+ # endpoint_url="https://example.org",
174
+ # aws_access_key_id="myUser",
175
+ # aws_secret_access_key="password",
176
+ # )
177
+ # conn.create_bucket(Bucket="test_bucket")
178
+
179
+ filename = 'bnodes.xml'
180
+ url = mock_dcat(rmock, filename)
181
+ org = OrganizationFactory()
182
+ source = HarvestSourceFactory(backend='dcat',
183
+ url=url,
184
+ organization=org)
185
+
186
+ actions.run(source.slug)
187
+
188
+ datasets = {d.harvest.dct_identifier: d for d in Dataset.objects}
189
+
190
+ assert datasets['1'].schema == None
191
+ resources_by_title = { resource['title']: resource for resource in datasets['1'].resources }
192
+
193
+ # Schema with wrong version are considered as external. Maybe we could change this in the future
194
+ assert resources_by_title['Resource 1-2'].schema.url == 'https://schema.data.gouv.fr/schemas/etalab/schema-irve-statique/1337.42.0/schema-statique.json'
195
+ assert resources_by_title['Resource 1-2'].schema.name == None
196
+ assert resources_by_title['Resource 1-2'].schema.version == None
197
+
198
+ assert datasets['2'].schema.name == None
199
+ assert datasets['2'].schema.url == 'https://www.ecologie.gouv.fr/sites/default/files/R%C3%A9glementation%20IRVE.pdf'
200
+ resources_by_title = { resource['title']: resource for resource in datasets['2'].resources }
201
+
202
+ # Unknown schema are kept as they were provided
203
+ assert resources_by_title['Resource 2-1'].schema.name == 'Example Schema'
204
+ assert resources_by_title['Resource 2-1'].schema.url == 'https://example.org/schema.json'
205
+ assert resources_by_title['Resource 2-1'].schema.version == None
206
+
207
+ assert resources_by_title['Resource 2-2'].schema == None
208
+
209
+ assert datasets['3'].schema == None
210
+ resources_by_title = { resource['title']: resource for resource in datasets['3'].resources }
211
+
212
+ # If there is just the URL, and it matches a known schema inside the catalog, only set the name and the version
213
+ # (discard the URL)
214
+ assert resources_by_title['Resource 3-1'].schema.name == 'etalab/schema-irve-statique'
215
+ assert resources_by_title['Resource 3-1'].schema.url == None
216
+ assert resources_by_title['Resource 3-1'].schema.version == '2.2.0'
217
+
218
+ job = HarvestJob.objects.order_by('-id').first()
219
+
220
+ assert job.source.slug == source.slug
221
+ assert get_from_json(current_app.config.get('HARVEST_GRAPHS_S3_BUCKET'), job.data['filename']) is not None
222
+
223
+ # Retention is 0 days in config
224
+ actions.purge_jobs()
225
+ assert get_from_json(current_app.config.get('HARVEST_GRAPHS_S3_BUCKET'), job.data['filename']) is None
226
+
227
+
228
+ @pytest.mark.options(SCHEMA_CATALOG_URL='https://example.com/schemas')
229
+ def test_harvest_spatial(self, rmock):
230
+ rmock.get('https://example.com/schemas', json=ResourceSchemaMockData.get_mock_data())
231
+
232
+ filename = 'bnodes.xml'
233
+ url = mock_dcat(rmock, filename)
234
+ org = OrganizationFactory()
235
+ source = HarvestSourceFactory(backend='dcat', url=url, organization=org)
236
+
237
+ actions.run(source.slug)
238
+
239
+ datasets = {d.harvest.dct_identifier: d for d in Dataset.objects}
240
+
241
+ assert datasets['1'].spatial == None
242
+ assert datasets['2'].spatial.geom == {'type': 'MultiPolygon', 'coordinates': [[[[4.44641288, 45.54214467], [4.44641288, 46.01316963], [4.75655252, 46.01316963], [4.75655252, 45.54214467], [4.44641288, 45.54214467]]]]}
243
+ assert datasets['3'].spatial == None
244
+
159
245
  @pytest.mark.options(SCHEMA_CATALOG_URL='https://example.com/schemas')
160
246
  def test_harvest_schemas(self, rmock):
161
247
  rmock.get('https://example.com/schemas', json=ResourceSchemaMockData.get_mock_data())
udata/settings.py CHANGED
@@ -257,6 +257,15 @@ class Defaults(object):
257
257
 
258
258
  HARVEST_VALIDATION_CONTACT_FORM = None
259
259
 
260
+ HARVEST_MAX_CATALOG_SIZE_IN_MONGO = None # Defaults to the size of a MongoDB document
261
+ HARVEST_GRAPHS_S3_BUCKET = None # If the catalog is bigger than `HARVEST_MAX_CATALOG_SIZE_IN_MONGO` store the graph inside S3 instead of MongoDB
262
+ HARVEST_GRAPHS_S3_FILENAME_PREFIX = '' # Useful to store the graphs inside a subfolder of the bucket. For example by setting `HARVEST_GRAPHS_S3_FILENAME_PREFIX = 'graphs/'`
263
+
264
+ # S3 connection details
265
+ S3_URL = None
266
+ S3_ACCESS_KEY_ID = None
267
+ S3_SECRET_ACCESS_KEY = None
268
+
260
269
  ACTIVATE_TERRITORIES = False
261
270
  # The order is important to compute parents/children, smaller first.
262
271
  HANDLED_LEVELS = tuple()