udata 9.1.2.dev30355__py2.py3-none-any.whl → 9.1.2.dev30454__py2.py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of udata might be problematic. Click here for more details.
- tasks/__init__.py +109 -107
- tasks/helpers.py +18 -18
- udata/__init__.py +4 -4
- udata/admin/views.py +5 -5
- udata/api/__init__.py +111 -134
- udata/api/commands.py +45 -37
- udata/api/errors.py +5 -4
- udata/api/fields.py +23 -21
- udata/api/oauth2.py +55 -74
- udata/api/parsers.py +15 -15
- udata/api/signals.py +1 -1
- udata/api_fields.py +137 -89
- udata/app.py +58 -55
- udata/assets.py +5 -5
- udata/auth/__init__.py +37 -26
- udata/auth/forms.py +23 -15
- udata/auth/helpers.py +1 -1
- udata/auth/mails.py +3 -3
- udata/auth/password_validation.py +19 -15
- udata/auth/views.py +94 -68
- udata/commands/__init__.py +71 -69
- udata/commands/cache.py +7 -7
- udata/commands/db.py +201 -140
- udata/commands/dcat.py +36 -30
- udata/commands/fixtures.py +100 -84
- udata/commands/images.py +21 -20
- udata/commands/info.py +17 -20
- udata/commands/init.py +10 -10
- udata/commands/purge.py +12 -13
- udata/commands/serve.py +41 -29
- udata/commands/static.py +16 -18
- udata/commands/test.py +20 -20
- udata/commands/tests/fixtures.py +26 -24
- udata/commands/worker.py +31 -33
- udata/core/__init__.py +12 -12
- udata/core/activity/__init__.py +0 -1
- udata/core/activity/api.py +59 -49
- udata/core/activity/models.py +28 -26
- udata/core/activity/signals.py +1 -1
- udata/core/activity/tasks.py +16 -10
- udata/core/badges/api.py +6 -6
- udata/core/badges/commands.py +14 -13
- udata/core/badges/fields.py +8 -5
- udata/core/badges/forms.py +7 -4
- udata/core/badges/models.py +16 -31
- udata/core/badges/permissions.py +1 -3
- udata/core/badges/signals.py +2 -2
- udata/core/badges/tasks.py +3 -2
- udata/core/badges/tests/test_commands.py +10 -10
- udata/core/badges/tests/test_model.py +24 -31
- udata/core/contact_point/api.py +19 -18
- udata/core/contact_point/api_fields.py +21 -14
- udata/core/contact_point/factories.py +2 -2
- udata/core/contact_point/forms.py +7 -6
- udata/core/contact_point/models.py +3 -5
- udata/core/dataservices/api.py +26 -21
- udata/core/dataservices/factories.py +13 -11
- udata/core/dataservices/models.py +35 -40
- udata/core/dataservices/permissions.py +4 -4
- udata/core/dataservices/rdf.py +40 -17
- udata/core/dataservices/tasks.py +4 -3
- udata/core/dataset/actions.py +10 -10
- udata/core/dataset/activities.py +21 -23
- udata/core/dataset/api.py +321 -298
- udata/core/dataset/api_fields.py +443 -271
- udata/core/dataset/apiv2.py +305 -229
- udata/core/dataset/commands.py +38 -36
- udata/core/dataset/constants.py +61 -54
- udata/core/dataset/csv.py +70 -74
- udata/core/dataset/events.py +39 -32
- udata/core/dataset/exceptions.py +8 -4
- udata/core/dataset/factories.py +57 -65
- udata/core/dataset/forms.py +87 -63
- udata/core/dataset/models.py +336 -280
- udata/core/dataset/permissions.py +9 -6
- udata/core/dataset/preview.py +15 -17
- udata/core/dataset/rdf.py +156 -122
- udata/core/dataset/search.py +92 -77
- udata/core/dataset/signals.py +1 -1
- udata/core/dataset/tasks.py +63 -54
- udata/core/discussions/actions.py +5 -5
- udata/core/discussions/api.py +124 -120
- udata/core/discussions/factories.py +2 -2
- udata/core/discussions/forms.py +9 -7
- udata/core/discussions/metrics.py +1 -3
- udata/core/discussions/models.py +25 -24
- udata/core/discussions/notifications.py +18 -14
- udata/core/discussions/permissions.py +3 -3
- udata/core/discussions/signals.py +4 -4
- udata/core/discussions/tasks.py +24 -28
- udata/core/followers/api.py +32 -33
- udata/core/followers/models.py +9 -9
- udata/core/followers/signals.py +3 -3
- udata/core/jobs/actions.py +7 -7
- udata/core/jobs/api.py +99 -92
- udata/core/jobs/commands.py +48 -49
- udata/core/jobs/forms.py +11 -11
- udata/core/jobs/models.py +6 -6
- udata/core/metrics/__init__.py +2 -2
- udata/core/metrics/commands.py +34 -30
- udata/core/metrics/models.py +2 -4
- udata/core/metrics/signals.py +1 -1
- udata/core/metrics/tasks.py +3 -3
- udata/core/organization/activities.py +12 -15
- udata/core/organization/api.py +167 -174
- udata/core/organization/api_fields.py +183 -124
- udata/core/organization/apiv2.py +32 -32
- udata/core/organization/commands.py +20 -22
- udata/core/organization/constants.py +11 -11
- udata/core/organization/csv.py +17 -15
- udata/core/organization/factories.py +8 -11
- udata/core/organization/forms.py +32 -26
- udata/core/organization/metrics.py +2 -1
- udata/core/organization/models.py +87 -67
- udata/core/organization/notifications.py +18 -14
- udata/core/organization/permissions.py +10 -11
- udata/core/organization/rdf.py +14 -14
- udata/core/organization/search.py +30 -28
- udata/core/organization/signals.py +7 -7
- udata/core/organization/tasks.py +42 -61
- udata/core/owned.py +38 -27
- udata/core/post/api.py +82 -81
- udata/core/post/constants.py +8 -5
- udata/core/post/factories.py +4 -4
- udata/core/post/forms.py +13 -14
- udata/core/post/models.py +20 -22
- udata/core/post/tests/test_api.py +30 -32
- udata/core/reports/api.py +8 -7
- udata/core/reports/constants.py +1 -3
- udata/core/reports/models.py +10 -10
- udata/core/reuse/activities.py +15 -19
- udata/core/reuse/api.py +123 -126
- udata/core/reuse/api_fields.py +120 -85
- udata/core/reuse/apiv2.py +11 -10
- udata/core/reuse/constants.py +23 -23
- udata/core/reuse/csv.py +18 -18
- udata/core/reuse/factories.py +5 -9
- udata/core/reuse/forms.py +24 -21
- udata/core/reuse/models.py +55 -51
- udata/core/reuse/permissions.py +2 -2
- udata/core/reuse/search.py +49 -46
- udata/core/reuse/signals.py +1 -1
- udata/core/reuse/tasks.py +4 -5
- udata/core/site/api.py +47 -50
- udata/core/site/factories.py +2 -2
- udata/core/site/forms.py +4 -5
- udata/core/site/models.py +94 -63
- udata/core/site/rdf.py +14 -14
- udata/core/spam/api.py +16 -9
- udata/core/spam/constants.py +4 -4
- udata/core/spam/fields.py +13 -7
- udata/core/spam/models.py +27 -20
- udata/core/spam/signals.py +1 -1
- udata/core/spam/tests/test_spam.py +6 -5
- udata/core/spatial/api.py +72 -80
- udata/core/spatial/api_fields.py +73 -58
- udata/core/spatial/commands.py +67 -64
- udata/core/spatial/constants.py +3 -3
- udata/core/spatial/factories.py +37 -54
- udata/core/spatial/forms.py +27 -26
- udata/core/spatial/geoids.py +17 -17
- udata/core/spatial/models.py +43 -47
- udata/core/spatial/tasks.py +2 -1
- udata/core/spatial/tests/test_api.py +115 -130
- udata/core/spatial/tests/test_fields.py +74 -77
- udata/core/spatial/tests/test_geoid.py +22 -22
- udata/core/spatial/tests/test_models.py +5 -7
- udata/core/spatial/translations.py +16 -16
- udata/core/storages/__init__.py +16 -18
- udata/core/storages/api.py +66 -64
- udata/core/storages/tasks.py +7 -7
- udata/core/storages/utils.py +15 -15
- udata/core/storages/views.py +5 -6
- udata/core/tags/api.py +17 -14
- udata/core/tags/csv.py +4 -4
- udata/core/tags/models.py +8 -5
- udata/core/tags/tasks.py +11 -13
- udata/core/tags/views.py +4 -4
- udata/core/topic/api.py +84 -73
- udata/core/topic/apiv2.py +157 -127
- udata/core/topic/factories.py +3 -4
- udata/core/topic/forms.py +12 -14
- udata/core/topic/models.py +14 -19
- udata/core/topic/parsers.py +26 -26
- udata/core/user/activities.py +30 -29
- udata/core/user/api.py +151 -152
- udata/core/user/api_fields.py +132 -100
- udata/core/user/apiv2.py +7 -7
- udata/core/user/commands.py +38 -38
- udata/core/user/factories.py +8 -9
- udata/core/user/forms.py +14 -11
- udata/core/user/metrics.py +2 -2
- udata/core/user/models.py +68 -69
- udata/core/user/permissions.py +4 -5
- udata/core/user/rdf.py +7 -8
- udata/core/user/tasks.py +2 -2
- udata/core/user/tests/test_user_model.py +24 -16
- udata/cors.py +99 -0
- udata/db/tasks.py +2 -1
- udata/entrypoints.py +35 -31
- udata/errors.py +2 -1
- udata/event/values.py +6 -6
- udata/factories.py +2 -2
- udata/features/identicon/api.py +5 -6
- udata/features/identicon/backends.py +48 -55
- udata/features/identicon/tests/test_backends.py +4 -5
- udata/features/notifications/__init__.py +0 -1
- udata/features/notifications/actions.py +9 -9
- udata/features/notifications/api.py +17 -13
- udata/features/territories/__init__.py +12 -10
- udata/features/territories/api.py +14 -15
- udata/features/territories/models.py +23 -28
- udata/features/transfer/actions.py +8 -11
- udata/features/transfer/api.py +84 -77
- udata/features/transfer/factories.py +2 -1
- udata/features/transfer/models.py +11 -12
- udata/features/transfer/notifications.py +19 -15
- udata/features/transfer/permissions.py +5 -5
- udata/forms/__init__.py +5 -2
- udata/forms/fields.py +164 -172
- udata/forms/validators.py +19 -22
- udata/forms/widgets.py +9 -13
- udata/frontend/__init__.py +31 -26
- udata/frontend/csv.py +68 -58
- udata/frontend/markdown.py +40 -44
- udata/harvest/actions.py +89 -77
- udata/harvest/api.py +294 -238
- udata/harvest/backends/__init__.py +4 -4
- udata/harvest/backends/base.py +128 -111
- udata/harvest/backends/dcat.py +80 -66
- udata/harvest/commands.py +56 -60
- udata/harvest/csv.py +8 -8
- udata/harvest/exceptions.py +6 -3
- udata/harvest/filters.py +24 -23
- udata/harvest/forms.py +27 -28
- udata/harvest/models.py +88 -80
- udata/harvest/notifications.py +15 -10
- udata/harvest/signals.py +13 -13
- udata/harvest/tasks.py +11 -10
- udata/harvest/tests/factories.py +23 -24
- udata/harvest/tests/test_actions.py +136 -166
- udata/harvest/tests/test_api.py +220 -214
- udata/harvest/tests/test_base_backend.py +117 -112
- udata/harvest/tests/test_dcat_backend.py +380 -308
- udata/harvest/tests/test_filters.py +33 -22
- udata/harvest/tests/test_models.py +11 -14
- udata/harvest/tests/test_notifications.py +6 -7
- udata/harvest/tests/test_tasks.py +7 -6
- udata/i18n.py +237 -78
- udata/linkchecker/backends.py +5 -11
- udata/linkchecker/checker.py +23 -22
- udata/linkchecker/commands.py +4 -6
- udata/linkchecker/models.py +6 -6
- udata/linkchecker/tasks.py +18 -20
- udata/mail.py +21 -21
- udata/migrations/2020-07-24-remove-s-from-scope-oauth.py +9 -8
- udata/migrations/2020-08-24-add-fs-filename.py +9 -8
- udata/migrations/2020-09-28-update-reuses-datasets-metrics.py +5 -4
- udata/migrations/2020-10-16-migrate-ods-resources.py +9 -10
- udata/migrations/2021-04-08-update-schema-with-new-structure.py +8 -7
- udata/migrations/2021-05-27-fix-default-schema-name.py +7 -6
- udata/migrations/2021-07-05-remove-unused-badges.py +17 -15
- udata/migrations/2021-07-07-update-schema-for-community-resources.py +7 -6
- udata/migrations/2021-08-17-follow-integrity.py +5 -4
- udata/migrations/2021-08-17-harvest-integrity.py +13 -12
- udata/migrations/2021-08-17-oauth2client-integrity.py +5 -4
- udata/migrations/2021-08-17-transfer-integrity.py +5 -4
- udata/migrations/2021-08-17-users-integrity.py +9 -8
- udata/migrations/2021-12-14-reuse-topics.py +7 -6
- udata/migrations/2022-04-21-improve-extension-detection.py +8 -7
- udata/migrations/2022-09-22-clean-inactive-harvest-datasets.py +16 -14
- udata/migrations/2022-10-10-add-fs_uniquifier-to-user-model.py +6 -6
- udata/migrations/2022-10-10-migrate-harvest-extras.py +36 -26
- udata/migrations/2023-02-08-rename-internal-dates.py +46 -28
- udata/migrations/2024-01-29-fix-reuse-and-dataset-with-private-None.py +10 -8
- udata/migrations/2024-03-22-migrate-activity-kwargs-to-extras.py +6 -4
- udata/migrations/2024-06-11-fix-reuse-datasets-references.py +7 -6
- udata/migrations/__init__.py +123 -105
- udata/models/__init__.py +4 -4
- udata/mongo/__init__.py +13 -11
- udata/mongo/badges_field.py +3 -2
- udata/mongo/datetime_fields.py +13 -12
- udata/mongo/document.py +17 -16
- udata/mongo/engine.py +15 -16
- udata/mongo/errors.py +2 -1
- udata/mongo/extras_fields.py +30 -20
- udata/mongo/queryset.py +12 -12
- udata/mongo/slug_fields.py +38 -28
- udata/mongo/taglist_field.py +1 -2
- udata/mongo/url_field.py +5 -5
- udata/mongo/uuid_fields.py +4 -3
- udata/notifications/__init__.py +1 -1
- udata/notifications/mattermost.py +10 -9
- udata/rdf.py +167 -188
- udata/routing.py +40 -45
- udata/search/__init__.py +18 -19
- udata/search/adapter.py +17 -16
- udata/search/commands.py +44 -51
- udata/search/fields.py +13 -20
- udata/search/query.py +23 -18
- udata/search/result.py +9 -10
- udata/sentry.py +21 -19
- udata/settings.py +262 -198
- udata/sitemap.py +8 -6
- udata/storage/s3.py +20 -13
- udata/tags.py +4 -5
- udata/tasks.py +43 -42
- udata/tests/__init__.py +9 -6
- udata/tests/api/__init__.py +8 -6
- udata/tests/api/test_auth_api.py +395 -321
- udata/tests/api/test_base_api.py +33 -35
- udata/tests/api/test_contact_points.py +7 -9
- udata/tests/api/test_dataservices_api.py +211 -158
- udata/tests/api/test_datasets_api.py +823 -812
- udata/tests/api/test_follow_api.py +13 -15
- udata/tests/api/test_me_api.py +95 -112
- udata/tests/api/test_organizations_api.py +301 -339
- udata/tests/api/test_reports_api.py +35 -25
- udata/tests/api/test_reuses_api.py +134 -139
- udata/tests/api/test_swagger.py +5 -5
- udata/tests/api/test_tags_api.py +18 -25
- udata/tests/api/test_topics_api.py +94 -94
- udata/tests/api/test_transfer_api.py +53 -48
- udata/tests/api/test_user_api.py +128 -141
- udata/tests/apiv2/test_datasets.py +290 -198
- udata/tests/apiv2/test_me_api.py +10 -11
- udata/tests/apiv2/test_organizations.py +56 -74
- udata/tests/apiv2/test_swagger.py +5 -5
- udata/tests/apiv2/test_topics.py +69 -87
- udata/tests/cli/test_cli_base.py +8 -8
- udata/tests/cli/test_db_cli.py +21 -19
- udata/tests/dataservice/test_dataservice_tasks.py +8 -12
- udata/tests/dataset/test_csv_adapter.py +44 -35
- udata/tests/dataset/test_dataset_actions.py +2 -3
- udata/tests/dataset/test_dataset_commands.py +7 -8
- udata/tests/dataset/test_dataset_events.py +36 -29
- udata/tests/dataset/test_dataset_model.py +224 -217
- udata/tests/dataset/test_dataset_rdf.py +142 -131
- udata/tests/dataset/test_dataset_tasks.py +15 -15
- udata/tests/dataset/test_resource_preview.py +10 -13
- udata/tests/features/territories/__init__.py +9 -13
- udata/tests/features/territories/test_territories_api.py +71 -91
- udata/tests/forms/test_basic_fields.py +7 -7
- udata/tests/forms/test_current_user_field.py +39 -66
- udata/tests/forms/test_daterange_field.py +31 -39
- udata/tests/forms/test_dict_field.py +28 -26
- udata/tests/forms/test_extras_fields.py +102 -76
- udata/tests/forms/test_form_field.py +8 -8
- udata/tests/forms/test_image_field.py +33 -26
- udata/tests/forms/test_model_field.py +134 -123
- udata/tests/forms/test_model_list_field.py +7 -7
- udata/tests/forms/test_nested_model_list_field.py +117 -79
- udata/tests/forms/test_publish_as_field.py +36 -65
- udata/tests/forms/test_reference_field.py +34 -53
- udata/tests/forms/test_user_forms.py +23 -21
- udata/tests/forms/test_uuid_field.py +6 -10
- udata/tests/frontend/__init__.py +9 -6
- udata/tests/frontend/test_auth.py +7 -6
- udata/tests/frontend/test_csv.py +81 -96
- udata/tests/frontend/test_hooks.py +43 -43
- udata/tests/frontend/test_markdown.py +211 -191
- udata/tests/helpers.py +32 -37
- udata/tests/models.py +2 -2
- udata/tests/organization/test_csv_adapter.py +21 -16
- udata/tests/organization/test_notifications.py +11 -18
- udata/tests/organization/test_organization_model.py +13 -13
- udata/tests/organization/test_organization_rdf.py +29 -22
- udata/tests/organization/test_organization_tasks.py +16 -17
- udata/tests/plugin.py +79 -73
- udata/tests/reuse/test_reuse_model.py +21 -21
- udata/tests/reuse/test_reuse_task.py +11 -13
- udata/tests/search/__init__.py +11 -12
- udata/tests/search/test_adapter.py +60 -70
- udata/tests/search/test_query.py +16 -16
- udata/tests/search/test_results.py +10 -7
- udata/tests/site/test_site_api.py +11 -16
- udata/tests/site/test_site_metrics.py +20 -30
- udata/tests/site/test_site_model.py +4 -5
- udata/tests/site/test_site_rdf.py +94 -78
- udata/tests/test_activity.py +17 -17
- udata/tests/test_cors.py +62 -0
- udata/tests/test_discussions.py +292 -299
- udata/tests/test_i18n.py +37 -40
- udata/tests/test_linkchecker.py +91 -85
- udata/tests/test_mail.py +13 -17
- udata/tests/test_migrations.py +219 -180
- udata/tests/test_model.py +164 -157
- udata/tests/test_notifications.py +17 -17
- udata/tests/test_owned.py +14 -14
- udata/tests/test_rdf.py +25 -23
- udata/tests/test_routing.py +89 -93
- udata/tests/test_storages.py +137 -128
- udata/tests/test_tags.py +44 -46
- udata/tests/test_topics.py +7 -7
- udata/tests/test_transfer.py +42 -49
- udata/tests/test_uris.py +160 -161
- udata/tests/test_utils.py +79 -71
- udata/tests/user/test_user_rdf.py +5 -9
- udata/tests/workers/test_jobs_commands.py +57 -58
- udata/tests/workers/test_tasks_routing.py +23 -29
- udata/tests/workers/test_workers_api.py +125 -131
- udata/tests/workers/test_workers_helpers.py +6 -6
- udata/tracking.py +4 -6
- udata/uris.py +45 -46
- udata/utils.py +68 -66
- udata/wsgi.py +1 -1
- {udata-9.1.2.dev30355.dist-info → udata-9.1.2.dev30454.dist-info}/METADATA +7 -3
- udata-9.1.2.dev30454.dist-info/RECORD +706 -0
- udata-9.1.2.dev30355.dist-info/RECORD +0 -704
- {udata-9.1.2.dev30355.dist-info → udata-9.1.2.dev30454.dist-info}/LICENSE +0 -0
- {udata-9.1.2.dev30355.dist-info → udata-9.1.2.dev30454.dist-info}/WHEEL +0 -0
- {udata-9.1.2.dev30355.dist-info → udata-9.1.2.dev30454.dist-info}/entry_points.txt +0 -0
- {udata-9.1.2.dev30355.dist-info → udata-9.1.2.dev30454.dist-info}/top_level.txt +0 -0
udata/harvest/backends/dcat.py
CHANGED
|
@@ -1,19 +1,25 @@
|
|
|
1
1
|
import logging
|
|
2
|
+
from datetime import date
|
|
3
|
+
from typing import Generator
|
|
2
4
|
|
|
3
|
-
from rdflib import Graph
|
|
4
|
-
from rdflib.namespace import RDF
|
|
5
5
|
import lxml.etree as ET
|
|
6
6
|
from flask import current_app
|
|
7
|
-
from
|
|
8
|
-
from
|
|
7
|
+
from rdflib import Graph
|
|
8
|
+
from rdflib.namespace import RDF
|
|
9
9
|
|
|
10
|
+
from udata.core.dataservices.rdf import dataservice_from_rdf
|
|
11
|
+
from udata.core.dataset.rdf import dataset_from_rdf
|
|
12
|
+
from udata.harvest.models import HarvestItem
|
|
10
13
|
from udata.rdf import (
|
|
11
|
-
DCAT,
|
|
14
|
+
DCAT,
|
|
15
|
+
DCT,
|
|
16
|
+
HYDRA,
|
|
17
|
+
SPDX,
|
|
18
|
+
guess_format,
|
|
19
|
+
namespace_manager,
|
|
20
|
+
url_from_rdf,
|
|
12
21
|
)
|
|
13
|
-
from udata.core.dataset.rdf import dataset_from_rdf
|
|
14
|
-
from udata.core.dataservices.rdf import dataservice_from_rdf
|
|
15
22
|
from udata.storage.s3 import store_as_json
|
|
16
|
-
from udata.harvest.models import HarvestItem
|
|
17
23
|
|
|
18
24
|
from .base import BaseBackend
|
|
19
25
|
|
|
@@ -23,9 +29,7 @@ log = logging.getLogger(__name__)
|
|
|
23
29
|
# Attributes representing nested classes to be stored in the graph
|
|
24
30
|
# in order to have a complete graph
|
|
25
31
|
DCAT_NESTING = {
|
|
26
|
-
DCAT.distribution: {
|
|
27
|
-
SPDX.checksum: {}
|
|
28
|
-
},
|
|
32
|
+
DCAT.distribution: {SPDX.checksum: {}},
|
|
29
33
|
DCT.temporal: {},
|
|
30
34
|
DCT.spatial: {},
|
|
31
35
|
}
|
|
@@ -36,16 +40,16 @@ DCAT_NESTING[DCAT.distributions] = DCAT_NESTING[DCAT.distribution]
|
|
|
36
40
|
# Known pagination class and their next page property
|
|
37
41
|
KNOWN_PAGINATION = (
|
|
38
42
|
(HYDRA.PartialCollectionView, HYDRA.next),
|
|
39
|
-
(HYDRA.PagedCollection, HYDRA.nextPage)
|
|
43
|
+
(HYDRA.PagedCollection, HYDRA.nextPage),
|
|
40
44
|
)
|
|
41
45
|
|
|
42
|
-
CSW_NAMESPACE =
|
|
43
|
-
OWS_NAMESPACE =
|
|
46
|
+
CSW_NAMESPACE = "http://www.opengis.net/cat/csw/2.0.2"
|
|
47
|
+
OWS_NAMESPACE = "http://www.opengis.net/ows"
|
|
44
48
|
|
|
45
49
|
# Useful to patch essential failing URIs
|
|
46
50
|
URIS_TO_REPLACE = {
|
|
47
51
|
# See https://github.com/etalab/data.gouv.fr/issues/1151
|
|
48
|
-
|
|
52
|
+
"https://project-open-data.cio.gov/v1.1/schema/catalog.jsonld": "https://gist.githubusercontent.com/maudetes/f019586185d6f59dcfb07f97148a1973/raw/585c3c7bf602b5a4e635b137257d0619792e2c1f/gistfile1.txt" # noqa
|
|
49
53
|
}
|
|
50
54
|
|
|
51
55
|
|
|
@@ -57,11 +61,11 @@ def extract_graph(source, target, node, specs):
|
|
|
57
61
|
|
|
58
62
|
|
|
59
63
|
class DcatBackend(BaseBackend):
|
|
60
|
-
display_name =
|
|
64
|
+
display_name = "DCAT"
|
|
61
65
|
|
|
62
66
|
def inner_harvest(self):
|
|
63
67
|
fmt = self.get_format()
|
|
64
|
-
self.job.data = {
|
|
68
|
+
self.job.data = {"format": fmt}
|
|
65
69
|
|
|
66
70
|
serialized_graphs = []
|
|
67
71
|
|
|
@@ -73,24 +77,30 @@ class DcatBackend(BaseBackend):
|
|
|
73
77
|
self.process_one_dataservices_page(page_number, page)
|
|
74
78
|
|
|
75
79
|
# The official MongoDB document size limit is 16MB. The default value here is 15MB to account for other fields in the document (and for the difference between * 1024 vs * 1000).
|
|
76
|
-
max_harvest_graph_size_in_mongo = current_app.config.get(
|
|
80
|
+
max_harvest_graph_size_in_mongo = current_app.config.get(
|
|
81
|
+
"HARVEST_MAX_CATALOG_SIZE_IN_MONGO"
|
|
82
|
+
)
|
|
77
83
|
if max_harvest_graph_size_in_mongo is None:
|
|
78
84
|
max_harvest_graph_size_in_mongo = 15 * 1000 * 1000
|
|
79
85
|
|
|
80
|
-
bucket = current_app.config.get(
|
|
86
|
+
bucket = current_app.config.get("HARVEST_GRAPHS_S3_BUCKET")
|
|
81
87
|
|
|
82
|
-
if
|
|
83
|
-
|
|
88
|
+
if (
|
|
89
|
+
bucket is not None
|
|
90
|
+
and sum([len(g.encode("utf-8")) for g in serialized_graphs])
|
|
91
|
+
>= max_harvest_graph_size_in_mongo
|
|
92
|
+
):
|
|
93
|
+
prefix = current_app.config.get("HARVEST_GRAPHS_S3_FILENAME_PREFIX") or ""
|
|
84
94
|
|
|
85
95
|
# TODO: we could store each page in independent files to allow downloading only the required page in
|
|
86
96
|
# subsequent jobs. (less data to download in each job)
|
|
87
|
-
filename = f
|
|
97
|
+
filename = f"{prefix}harvest_{self.job.id}_{date.today()}.json"
|
|
88
98
|
|
|
89
99
|
store_as_json(bucket, filename, serialized_graphs)
|
|
90
100
|
|
|
91
|
-
self.job.data[
|
|
101
|
+
self.job.data["filename"] = filename
|
|
92
102
|
else:
|
|
93
|
-
self.job.data[
|
|
103
|
+
self.job.data["graphs"] = serialized_graphs
|
|
94
104
|
|
|
95
105
|
def get_format(self):
|
|
96
106
|
fmt = guess_format(self.source.url)
|
|
@@ -99,9 +109,9 @@ class DcatBackend(BaseBackend):
|
|
|
99
109
|
if not fmt:
|
|
100
110
|
response = self.head(self.source.url)
|
|
101
111
|
response.raise_for_status()
|
|
102
|
-
mime_type = response.headers.get(
|
|
112
|
+
mime_type = response.headers.get("Content-Type", "").split(";", 1)[0]
|
|
103
113
|
if not mime_type:
|
|
104
|
-
msg =
|
|
114
|
+
msg = "Unable to detect format from extension or mime type"
|
|
105
115
|
raise ValueError(msg)
|
|
106
116
|
fmt = guess_format(mime_type)
|
|
107
117
|
if not fmt:
|
|
@@ -136,7 +146,7 @@ class DcatBackend(BaseBackend):
|
|
|
136
146
|
return
|
|
137
147
|
|
|
138
148
|
page_number += 1
|
|
139
|
-
|
|
149
|
+
|
|
140
150
|
def process_one_datasets_page(self, page_number: int, page: Graph):
|
|
141
151
|
for node in page.subjects(RDF.type, DCAT.Dataset):
|
|
142
152
|
remote_id = page.value(node, DCT.identifier)
|
|
@@ -152,47 +162,45 @@ class DcatBackend(BaseBackend):
|
|
|
152
162
|
|
|
153
163
|
if self.is_done():
|
|
154
164
|
return
|
|
155
|
-
|
|
165
|
+
|
|
156
166
|
def inner_process_dataset(self, item: HarvestItem, page_number: int, page: Graph, node):
|
|
157
|
-
item.kwargs[
|
|
167
|
+
item.kwargs["page_number"] = page_number
|
|
158
168
|
|
|
159
169
|
dataset = self.get_dataset(item.remote_id)
|
|
160
170
|
return dataset_from_rdf(page, dataset, node=node)
|
|
161
171
|
|
|
162
172
|
def inner_process_dataservice(self, item: HarvestItem, page_number: int, page: Graph, node):
|
|
163
|
-
item.kwargs[
|
|
173
|
+
item.kwargs["page_number"] = page_number
|
|
164
174
|
|
|
165
175
|
dataservice = self.get_dataservice(item.remote_id)
|
|
166
|
-
return dataservice_from_rdf(
|
|
176
|
+
return dataservice_from_rdf(
|
|
177
|
+
page, dataservice, node, [item.dataset for item in self.job.items]
|
|
178
|
+
)
|
|
167
179
|
|
|
168
180
|
def get_node_from_item(self, graph, item):
|
|
169
181
|
for node in graph.subjects(RDF.type, DCAT.Dataset):
|
|
170
182
|
if str(graph.value(node, DCT.identifier)) == item.remote_id:
|
|
171
183
|
return node
|
|
172
|
-
raise ValueError(f
|
|
184
|
+
raise ValueError(f"Unable to find dataset with DCT.identifier:{item.remote_id}")
|
|
173
185
|
|
|
174
186
|
def next_record_if_should_continue(self, start, search_results):
|
|
175
|
-
next_record = int(search_results.attrib[
|
|
176
|
-
matched_count = int(search_results.attrib[
|
|
177
|
-
returned_count = int(search_results.attrib[
|
|
187
|
+
next_record = int(search_results.attrib["nextRecord"])
|
|
188
|
+
matched_count = int(search_results.attrib["numberOfRecordsMatched"])
|
|
189
|
+
returned_count = int(search_results.attrib["numberOfRecordsReturned"])
|
|
178
190
|
|
|
179
191
|
# Break conditions copied gratefully from
|
|
180
192
|
# noqa https://github.com/geonetwork/core-geonetwork/blob/main/harvesters/src/main/java/org/fao/geonet/kernel/harvest/harvester/csw/Harvester.java#L338-L369
|
|
181
193
|
break_conditions = (
|
|
182
194
|
# standard CSW: A value of 0 means all records have been returned.
|
|
183
195
|
next_record == 0,
|
|
184
|
-
|
|
185
196
|
# Misbehaving CSW server returning a next record > matched count
|
|
186
197
|
next_record > matched_count,
|
|
187
|
-
|
|
188
198
|
# No results returned already
|
|
189
199
|
returned_count == 0,
|
|
190
|
-
|
|
191
200
|
# Current next record is lower than previous one
|
|
192
201
|
next_record < start,
|
|
193
|
-
|
|
194
202
|
# Enough items have been harvested already
|
|
195
|
-
self.max_items and len(self.job.items) >= self.max_items
|
|
203
|
+
self.max_items and len(self.job.items) >= self.max_items,
|
|
196
204
|
)
|
|
197
205
|
|
|
198
206
|
if any(break_conditions):
|
|
@@ -200,16 +208,17 @@ class DcatBackend(BaseBackend):
|
|
|
200
208
|
else:
|
|
201
209
|
return next_record
|
|
202
210
|
|
|
211
|
+
|
|
203
212
|
class CswDcatBackend(DcatBackend):
|
|
204
|
-
display_name =
|
|
213
|
+
display_name = "CSW-DCAT"
|
|
205
214
|
|
|
206
|
-
DCAT_SCHEMA =
|
|
215
|
+
DCAT_SCHEMA = "http://www.w3.org/ns/dcat#"
|
|
207
216
|
|
|
208
217
|
def walk_graph(self, url: str, fmt: str) -> Generator[tuple[int, Graph], None, None]:
|
|
209
218
|
"""
|
|
210
219
|
Yield all RDF pages as `Graph` from the source
|
|
211
220
|
"""
|
|
212
|
-
body =
|
|
221
|
+
body = """<csw:GetRecords xmlns:csw="http://www.opengis.net/cat/csw/2.0.2"
|
|
213
222
|
xmlns:gmd="http://www.isotc211.org/2005/gmd"
|
|
214
223
|
service="CSW" version="2.0.2" resultType="results"
|
|
215
224
|
startPosition="{start}" maxPosition="200"
|
|
@@ -223,23 +232,24 @@ class CswDcatBackend(DcatBackend):
|
|
|
223
232
|
</ogc:SortProperty>
|
|
224
233
|
</ogc:SortBy>
|
|
225
234
|
</csw:Query>
|
|
226
|
-
</csw:GetRecords>
|
|
227
|
-
headers = {
|
|
235
|
+
</csw:GetRecords>"""
|
|
236
|
+
headers = {"Content-Type": "application/xml"}
|
|
228
237
|
|
|
229
238
|
page_number = 0
|
|
230
239
|
start = 1
|
|
231
240
|
|
|
232
|
-
response = self.post(
|
|
233
|
-
|
|
241
|
+
response = self.post(
|
|
242
|
+
url, data=body.format(start=start, schema=self.DCAT_SCHEMA), headers=headers
|
|
243
|
+
)
|
|
234
244
|
response.raise_for_status()
|
|
235
245
|
content = response.content
|
|
236
246
|
tree = ET.fromstring(content)
|
|
237
|
-
if tree.tag ==
|
|
238
|
-
raise ValueError(f
|
|
247
|
+
if tree.tag == "{" + OWS_NAMESPACE + "}ExceptionReport":
|
|
248
|
+
raise ValueError(f"Failed to query CSW:\n{content}")
|
|
239
249
|
while tree:
|
|
240
|
-
search_results = tree.find(
|
|
250
|
+
search_results = tree.find("csw:SearchResults", {"csw": CSW_NAMESPACE})
|
|
241
251
|
if search_results is None:
|
|
242
|
-
log.error(f
|
|
252
|
+
log.error(f"No search results found for {url} on page {page_number}")
|
|
243
253
|
break
|
|
244
254
|
for child in search_results:
|
|
245
255
|
subgraph = Graph(namespace_manager=namespace_manager)
|
|
@@ -257,19 +267,21 @@ class CswDcatBackend(DcatBackend):
|
|
|
257
267
|
page_number += 1
|
|
258
268
|
|
|
259
269
|
tree = ET.fromstring(
|
|
260
|
-
self.post(
|
|
261
|
-
|
|
270
|
+
self.post(
|
|
271
|
+
url, data=body.format(start=start, schema=self.DCAT_SCHEMA), headers=headers
|
|
272
|
+
).content
|
|
273
|
+
)
|
|
262
274
|
|
|
263
275
|
|
|
264
276
|
class CswIso19139DcatBackend(DcatBackend):
|
|
265
|
-
|
|
277
|
+
"""
|
|
266
278
|
A harvester that takes CSW ISO 19139 as input and transforms it to DCAT using SEMIC GeoDCAT-AP XSLT.
|
|
267
279
|
The parsing of items is then the same as for the DcatBackend.
|
|
268
|
-
|
|
280
|
+
"""
|
|
269
281
|
|
|
270
|
-
display_name =
|
|
282
|
+
display_name = "CSW-ISO-19139"
|
|
271
283
|
|
|
272
|
-
ISO_SCHEMA =
|
|
284
|
+
ISO_SCHEMA = "http://www.isotc211.org/2005/gmd"
|
|
273
285
|
|
|
274
286
|
XSL_URL = "https://raw.githubusercontent.com/SEMICeu/iso-19139-to-dcat-ap/master/iso-19139-to-dcat-ap.xsl"
|
|
275
287
|
|
|
@@ -287,7 +299,7 @@ class CswIso19139DcatBackend(DcatBackend):
|
|
|
287
299
|
|
|
288
300
|
# Start querying and parsing graph
|
|
289
301
|
# Filter on dataset or serie records
|
|
290
|
-
body =
|
|
302
|
+
body = """<csw:GetRecords xmlns:csw="http://www.opengis.net/cat/csw/2.0.2"
|
|
291
303
|
xmlns:gmd="http://www.isotc211.org/2005/gmd"
|
|
292
304
|
service="CSW" version="2.0.2" resultType="results"
|
|
293
305
|
startPosition="{start}" maxPosition="10"
|
|
@@ -313,14 +325,15 @@ class CswIso19139DcatBackend(DcatBackend):
|
|
|
313
325
|
</ogc:Filter>
|
|
314
326
|
</csw:Constraint>
|
|
315
327
|
</csw:Query>
|
|
316
|
-
</csw:GetRecords>
|
|
317
|
-
headers = {
|
|
328
|
+
</csw:GetRecords>"""
|
|
329
|
+
headers = {"Content-Type": "application/xml"}
|
|
318
330
|
|
|
319
331
|
page_number = 0
|
|
320
332
|
start = 1
|
|
321
333
|
|
|
322
|
-
response = self.post(
|
|
323
|
-
|
|
334
|
+
response = self.post(
|
|
335
|
+
url, data=body.format(start=start, schema=self.ISO_SCHEMA), headers=headers
|
|
336
|
+
)
|
|
324
337
|
response.raise_for_status()
|
|
325
338
|
|
|
326
339
|
tree_before_transform = ET.fromstring(response.content)
|
|
@@ -331,9 +344,9 @@ class CswIso19139DcatBackend(DcatBackend):
|
|
|
331
344
|
while tree:
|
|
332
345
|
# We query the tree before the transformation because the XSLT removes the search results
|
|
333
346
|
# infos (useful for pagination)
|
|
334
|
-
search_results = tree_before_transform.find(
|
|
347
|
+
search_results = tree_before_transform.find("csw:SearchResults", {"csw": CSW_NAMESPACE})
|
|
335
348
|
if search_results is None:
|
|
336
|
-
log.error(f
|
|
349
|
+
log.error(f"No search results found for {url} on page {page_number}")
|
|
337
350
|
break
|
|
338
351
|
|
|
339
352
|
subgraph = Graph(namespace_manager=namespace_manager)
|
|
@@ -353,8 +366,9 @@ class CswIso19139DcatBackend(DcatBackend):
|
|
|
353
366
|
start = next_record
|
|
354
367
|
page_number += 1
|
|
355
368
|
|
|
356
|
-
response = self.post(
|
|
357
|
-
|
|
369
|
+
response = self.post(
|
|
370
|
+
url, data=body.format(start=start, schema=self.ISO_SCHEMA), headers=headers
|
|
371
|
+
)
|
|
358
372
|
response.raise_for_status()
|
|
359
373
|
|
|
360
374
|
tree_before_transform = ET.fromstring(response.content)
|
udata/harvest/commands.py
CHANGED
|
@@ -6,155 +6,151 @@ from udata.commands import cli
|
|
|
6
6
|
|
|
7
7
|
from . import actions
|
|
8
8
|
|
|
9
|
-
|
|
10
9
|
log = logging.getLogger(__name__)
|
|
11
10
|
|
|
12
11
|
|
|
13
|
-
@cli.group(
|
|
12
|
+
@cli.group("harvest")
|
|
14
13
|
def grp():
|
|
15
|
-
|
|
14
|
+
"""Remote repositories harvesting operations"""
|
|
16
15
|
pass
|
|
17
16
|
|
|
18
17
|
|
|
19
18
|
@grp.command()
|
|
20
|
-
@click.argument(
|
|
21
|
-
@click.argument(
|
|
22
|
-
@click.argument(
|
|
23
|
-
@click.option(
|
|
24
|
-
@click.option(
|
|
25
|
-
@click.option(
|
|
19
|
+
@click.argument("backend")
|
|
20
|
+
@click.argument("url")
|
|
21
|
+
@click.argument("name")
|
|
22
|
+
@click.option("-f", "--frequency", default=None)
|
|
23
|
+
@click.option("-u", "--owner", default=None)
|
|
24
|
+
@click.option("-o", "--org", default=None)
|
|
26
25
|
def create(name, url, backend, frequency=None, owner=None, org=None):
|
|
27
|
-
|
|
26
|
+
"""Create a new harvest source"""
|
|
28
27
|
log.info('Creating a new Harvest source "%s"', name)
|
|
29
|
-
source = actions.create_source(
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
|
|
28
|
+
source = actions.create_source(
|
|
29
|
+
name, url, backend, frequency=frequency, owner=owner, organization=org
|
|
30
|
+
)
|
|
31
|
+
log.info(
|
|
32
|
+
"""Created a new Harvest source:
|
|
34
33
|
name: {0.name},
|
|
35
34
|
slug: {0.slug},
|
|
36
35
|
url: {0.url},
|
|
37
36
|
backend: {0.backend},
|
|
38
37
|
frequency: {0.frequency},
|
|
39
38
|
owner: {0.owner},
|
|
40
|
-
organization: {0.organization}
|
|
39
|
+
organization: {0.organization}""".format(source)
|
|
40
|
+
)
|
|
41
41
|
|
|
42
42
|
|
|
43
43
|
@grp.command()
|
|
44
|
-
@click.argument(
|
|
44
|
+
@click.argument("identifier")
|
|
45
45
|
def validate(identifier):
|
|
46
|
-
|
|
46
|
+
"""Validate a source given its identifier"""
|
|
47
47
|
source = actions.validate_source(identifier)
|
|
48
|
-
log.info(
|
|
48
|
+
log.info("Source %s (%s) has been validated", source.slug, str(source.id))
|
|
49
49
|
|
|
50
50
|
|
|
51
51
|
@grp.command()
|
|
52
52
|
def delete(identifier):
|
|
53
|
-
|
|
53
|
+
"""Delete a harvest source"""
|
|
54
54
|
log.info('Deleting source "%s"', identifier)
|
|
55
55
|
actions.delete_source(identifier)
|
|
56
56
|
log.info('Deleted source "%s"', identifier)
|
|
57
57
|
|
|
58
58
|
|
|
59
59
|
@grp.command()
|
|
60
|
-
@click.argument(
|
|
60
|
+
@click.argument("identifier")
|
|
61
61
|
def clean(identifier):
|
|
62
|
-
|
|
62
|
+
"""Delete all datasets linked to a harvest source"""
|
|
63
63
|
log.info(f'Cleaning source "{identifier}"')
|
|
64
64
|
num_of_datasets = actions.clean_source(identifier)
|
|
65
65
|
log.info(f'Cleaned source "{identifier}" - deleted {num_of_datasets} dataset(s)')
|
|
66
66
|
|
|
67
67
|
|
|
68
68
|
@grp.command()
|
|
69
|
-
@click.option(
|
|
70
|
-
help='list only scheduled source')
|
|
69
|
+
@click.option("-s", "--scheduled", is_flag=True, help="list only scheduled source")
|
|
71
70
|
def sources(scheduled=False):
|
|
72
|
-
|
|
71
|
+
"""List all harvest sources"""
|
|
73
72
|
sources = actions.list_sources()
|
|
74
73
|
if scheduled:
|
|
75
74
|
sources = [s for s in sources if s.periodic_task]
|
|
76
75
|
if sources:
|
|
77
76
|
for source in sources:
|
|
78
|
-
msg =
|
|
77
|
+
msg = "{source.name} ({source.backend}): {cron}"
|
|
79
78
|
if source.periodic_task:
|
|
80
79
|
cron = source.periodic_task.schedule_display
|
|
81
80
|
else:
|
|
82
|
-
cron =
|
|
81
|
+
cron = "not scheduled"
|
|
83
82
|
log.info(msg.format(source=source, cron=cron))
|
|
84
83
|
elif scheduled:
|
|
85
|
-
log.info(
|
|
84
|
+
log.info("No sources scheduled yet")
|
|
86
85
|
else:
|
|
87
|
-
log.info(
|
|
86
|
+
log.info("No sources defined yet")
|
|
88
87
|
|
|
89
88
|
|
|
90
89
|
@grp.command()
|
|
91
90
|
def backends():
|
|
92
|
-
|
|
93
|
-
log.info(
|
|
91
|
+
"""List available backends"""
|
|
92
|
+
log.info("Available backends:")
|
|
94
93
|
for backend in actions.list_backends():
|
|
95
|
-
log.info(
|
|
94
|
+
log.info("%s (%s)", backend.name, backend.display_name or backend.name)
|
|
96
95
|
|
|
97
96
|
|
|
98
97
|
@grp.command()
|
|
99
|
-
@click.argument(
|
|
98
|
+
@click.argument("identifier")
|
|
100
99
|
def launch(identifier):
|
|
101
|
-
|
|
100
|
+
"""Launch a source harvesting on the workers"""
|
|
102
101
|
log.info('Launching harvest job for source "%s"', identifier)
|
|
103
102
|
actions.launch(identifier)
|
|
104
103
|
|
|
105
104
|
|
|
106
105
|
@grp.command()
|
|
107
|
-
@click.argument(
|
|
106
|
+
@click.argument("identifier")
|
|
108
107
|
def run(identifier):
|
|
109
|
-
|
|
108
|
+
"""Run a harvester synchronously"""
|
|
110
109
|
log.info('Harvesting source "%s"', identifier)
|
|
111
110
|
actions.run(identifier)
|
|
112
111
|
|
|
113
112
|
|
|
114
113
|
@grp.command()
|
|
115
|
-
@click.argument(
|
|
116
|
-
@click.option(
|
|
117
|
-
|
|
118
|
-
@click.option(
|
|
119
|
-
|
|
120
|
-
|
|
121
|
-
|
|
122
|
-
@click.option(
|
|
123
|
-
help='The crontab expression for day of month')
|
|
124
|
-
@click.option('-M', '--month-of-year', default='*',
|
|
125
|
-
help='The crontab expression for month of year')
|
|
114
|
+
@click.argument("identifier")
|
|
115
|
+
@click.option("-m", "--minute", default="*", help="The crontab expression for minute")
|
|
116
|
+
@click.option("-h", "--hour", default="*", help="The crontab expression for hour")
|
|
117
|
+
@click.option(
|
|
118
|
+
"-d", "--day", "day_of_week", default="*", help="The crontab expression for day of week"
|
|
119
|
+
)
|
|
120
|
+
@click.option("-D", "--day-of-month", default="*", help="The crontab expression for day of month")
|
|
121
|
+
@click.option("-M", "--month-of-year", default="*", help="The crontab expression for month of year")
|
|
126
122
|
def schedule(identifier, **kwargs):
|
|
127
|
-
|
|
123
|
+
"""Schedule a harvest job to run periodically"""
|
|
128
124
|
source = actions.schedule(identifier, **kwargs)
|
|
129
|
-
msg =
|
|
125
|
+
msg = "Scheduled {source.name} with the following crontab: {cron}"
|
|
130
126
|
log.info(msg.format(source=source, cron=source.periodic_task.crontab))
|
|
131
127
|
|
|
132
128
|
|
|
133
129
|
@grp.command()
|
|
134
|
-
@click.argument(
|
|
130
|
+
@click.argument("identifier")
|
|
135
131
|
def unschedule(identifier):
|
|
136
|
-
|
|
132
|
+
"""Unschedule a periodical harvest job"""
|
|
137
133
|
source = actions.unschedule(identifier)
|
|
138
134
|
log.info('Unscheduled harvest source "%s"', source.name)
|
|
139
135
|
|
|
140
136
|
|
|
141
137
|
@grp.command()
|
|
142
138
|
def purge():
|
|
143
|
-
|
|
144
|
-
log.info(
|
|
139
|
+
"""Permanently remove deleted harvest sources"""
|
|
140
|
+
log.info("Purging deleted harvest sources")
|
|
145
141
|
count = actions.purge_sources()
|
|
146
|
-
log.info(
|
|
142
|
+
log.info("Purged %s source(s)", count)
|
|
147
143
|
|
|
148
144
|
|
|
149
145
|
@grp.command()
|
|
150
|
-
@click.argument(
|
|
151
|
-
@click.argument(
|
|
146
|
+
@click.argument("filename")
|
|
147
|
+
@click.argument("domain")
|
|
152
148
|
def attach(domain, filename):
|
|
153
|
-
|
|
149
|
+
"""
|
|
154
150
|
Attach existing datasets to their harvest remote id
|
|
155
151
|
|
|
156
152
|
Mapping between identifiers should be in FILENAME CSV file.
|
|
157
|
-
|
|
158
|
-
log.info(
|
|
153
|
+
"""
|
|
154
|
+
log.info("Attaching datasets for domain %s", domain)
|
|
159
155
|
result = actions.attach(domain, filename)
|
|
160
|
-
log.info(
|
|
156
|
+
log.info("Attached %s datasets to %s", result.success, domain)
|
udata/harvest/csv.py
CHANGED
|
@@ -6,12 +6,12 @@ from .models import HarvestSource
|
|
|
6
6
|
@csv.adapter(HarvestSource)
|
|
7
7
|
class HarvestSourceCsvAdapter(csv.Adapter):
|
|
8
8
|
fields = (
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
(
|
|
13
|
-
(
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
(
|
|
9
|
+
"id",
|
|
10
|
+
"name",
|
|
11
|
+
"url",
|
|
12
|
+
("organization", "organization.name"),
|
|
13
|
+
("organization_id", "organization.id"),
|
|
14
|
+
"backend",
|
|
15
|
+
"created_at",
|
|
16
|
+
("validation", lambda o: o.validation.state),
|
|
17
17
|
)
|
udata/harvest/exceptions.py
CHANGED
|
@@ -1,13 +1,16 @@
|
|
|
1
1
|
class HarvestException(Exception):
|
|
2
|
-
|
|
2
|
+
"""Base class for all harvest exception"""
|
|
3
|
+
|
|
3
4
|
pass
|
|
4
5
|
|
|
5
6
|
|
|
6
7
|
class HarvestSkipException(HarvestException):
|
|
7
|
-
|
|
8
|
+
"""Raised when an item is skipped"""
|
|
9
|
+
|
|
8
10
|
pass
|
|
9
11
|
|
|
10
12
|
|
|
11
13
|
class HarvestValidationError(HarvestException):
|
|
12
|
-
|
|
14
|
+
"""Raised when an harvested item is invalid"""
|
|
15
|
+
|
|
13
16
|
pass
|