PyPI - udata - Versions diffs - 10.8.1.dev36703__py2.py3-none-any.whl → 10.8.2__py2.py3-none-any.whl - Mend

udata 10.8.1.dev36703__py2.py3-none-any.whl → 10.8.2__py2.py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of udata might be problematic. Click here for more details.

Files changed (79) hide show

udata/__init__.py +1 -1
udata/app.py +0 -2
udata/commands/db.py +22 -9
udata/core/dataset/models.py +5 -3
udata/core/discussions/api.py +2 -2
udata/core/jobs/api.py +3 -3
udata/core/metrics/helpers.py +10 -0
udata/core/metrics/tasks.py +144 -1
udata/core/organization/api.py +2 -2
udata/core/post/api.py +1 -1
udata/core/user/api.py +1 -1
udata/features/identicon/api.py +1 -1
udata/harvest/actions.py +24 -28
udata/harvest/api.py +28 -36
udata/harvest/backends/ckan/__init__.py +3 -0
udata/harvest/backends/ckan/harvesters.py +274 -0
udata/harvest/backends/ckan/schemas/__init__.py +0 -0
udata/harvest/backends/ckan/schemas/ckan.py +86 -0
udata/harvest/backends/ckan/schemas/dkan.py +98 -0
udata/harvest/commands.py +7 -7
udata/harvest/tasks.py +1 -1
udata/harvest/tests/ckan/conftest.py +67 -0
udata/harvest/tests/ckan/data/dkan-french-w-license.json +226 -0
udata/harvest/tests/ckan/test_ckan_backend.py +697 -0
udata/harvest/tests/ckan/test_ckan_backend_errors.py +140 -0
udata/harvest/tests/ckan/test_ckan_backend_filters.py +130 -0
udata/harvest/tests/ckan/test_dkan_backend.py +68 -0
udata/harvest/tests/test_actions.py +27 -32
udata/harvest/tests/test_api.py +23 -18
udata/harvest/tests/test_dcat_backend.py +29 -29
udata/migrations/2025-07-30-purge-old-harvest-dynamic-fields.py +29 -0
udata/mongo/slug_fields.py +1 -1
udata/routing.py +6 -0
udata/static/chunks/{11.b6f741fcc366abfad9c4.js → 11.51d706fb9521c16976bc.js} +3 -3
udata/static/chunks/{11.b6f741fcc366abfad9c4.js.map → 11.51d706fb9521c16976bc.js.map} +1 -1
udata/static/chunks/{13.2d06442dd9a05d9777b5.js → 13.39e106d56f794ebd06a0.js} +2 -2
udata/static/chunks/{13.2d06442dd9a05d9777b5.js.map → 13.39e106d56f794ebd06a0.js.map} +1 -1
udata/static/chunks/{17.e8e4caaad5cb0cc0bacc.js → 17.70cbb4a91b002338007e.js} +2 -2
udata/static/chunks/{17.e8e4caaad5cb0cc0bacc.js.map → 17.70cbb4a91b002338007e.js.map} +1 -1
udata/static/chunks/{19.f03a102365af4315f9db.js → 19.a348a5fff8fe2801e52a.js} +3 -3
udata/static/chunks/{19.f03a102365af4315f9db.js.map → 19.a348a5fff8fe2801e52a.js.map} +1 -1
udata/static/chunks/{5.0fa1408dae4e76b87b2e.js → 5.343ca020a2d38cec1a14.js} +3 -3
udata/static/chunks/{5.0fa1408dae4e76b87b2e.js.map → 5.343ca020a2d38cec1a14.js.map} +1 -1
udata/static/chunks/{6.d663709d877baa44a71e.js → 6.a3b07de9dd2ca2d24e85.js} +3 -3
udata/static/chunks/{6.d663709d877baa44a71e.js.map → 6.a3b07de9dd2ca2d24e85.js.map} +1 -1
udata/static/chunks/{8.778091d55cd8ea39af6b.js → 8.462bb3029de008497675.js} +2 -2
udata/static/chunks/{8.778091d55cd8ea39af6b.js.map → 8.462bb3029de008497675.js.map} +1 -1
udata/static/common.js +1 -1
udata/static/common.js.map +1 -1
udata/tests/api/test_datasets_api.py +0 -46
udata/tests/api/test_organizations_api.py +5 -0
udata/tests/cli/test_db_cli.py +12 -0
udata/tests/dataset/test_dataset_model.py +0 -16
udata/tests/metrics/__init__.py +0 -0
udata/tests/metrics/conftest.py +15 -0
udata/tests/metrics/helpers.py +58 -0
udata/tests/metrics/test_metrics.py +67 -0
udata/tests/metrics/test_tasks.py +171 -0
udata/translations/ar/LC_MESSAGES/udata.mo +0 -0
udata/translations/ar/LC_MESSAGES/udata.po +72 -65
udata/translations/de/LC_MESSAGES/udata.mo +0 -0
udata/translations/de/LC_MESSAGES/udata.po +72 -65
udata/translations/es/LC_MESSAGES/udata.mo +0 -0
udata/translations/es/LC_MESSAGES/udata.po +72 -65
udata/translations/fr/LC_MESSAGES/udata.mo +0 -0
udata/translations/fr/LC_MESSAGES/udata.po +72 -65
udata/translations/it/LC_MESSAGES/udata.mo +0 -0
udata/translations/it/LC_MESSAGES/udata.po +72 -65
udata/translations/pt/LC_MESSAGES/udata.mo +0 -0
udata/translations/pt/LC_MESSAGES/udata.po +72 -65
udata/translations/sr/LC_MESSAGES/udata.mo +0 -0
udata/translations/sr/LC_MESSAGES/udata.po +72 -65
udata/translations/udata.pot +74 -70
{udata-10.8.1.dev36703.dist-info → udata-10.8.2.dist-info}/METADATA +15 -2
{udata-10.8.1.dev36703.dist-info → udata-10.8.2.dist-info}/RECORD +79 -62
{udata-10.8.1.dev36703.dist-info → udata-10.8.2.dist-info}/entry_points.txt +2 -0
{udata-10.8.1.dev36703.dist-info → udata-10.8.2.dist-info}/LICENSE +0 -0
{udata-10.8.1.dev36703.dist-info → udata-10.8.2.dist-info}/WHEEL +0 -0
{udata-10.8.1.dev36703.dist-info → udata-10.8.2.dist-info}/top_level.txt +0 -0

udata/__init__.py CHANGED Viewed

@@ -4,5 +4,5 @@
 udata
 """
-__version__ = "10.8.1.dev"
+__version__ = "10.8.2"
 __description__ = "Open data portal"

udata/app.py CHANGED Viewed

@@ -210,7 +210,6 @@ def register_extensions(app):
         routing,
         search,
         sentry,
-        sitemap,
         tasks,
     )
@@ -225,7 +224,6 @@ def register_extensions(app):
     csrf.init_app(app)
     mail.init_app(app)
     search.init_app(app)
-    sitemap.init_app(app)
     sentry.init_app(app)
     return app

udata/commands/db.py CHANGED Viewed

@@ -2,6 +2,7 @@ import collections
 import copy
 import logging
 import os
+import sys
 import traceback
 from itertools import groupby
 from typing import Optional
@@ -312,15 +313,26 @@ def check_references(models_to_check):
                                     f"\t{model.__name__}#{obj.id} have a broken reference for `{reference['name']}`"
                                 )
                         elif reference["type"] == "list":
-                            attr_list = getattr(obj, reference["name"], [])
-                            for i, sub in enumerate(attr_list):
-                                # If it's still an instance of DBRef it means that it failed to
-                                # dereference the ID.
-                                if isinstance(sub, DBRef):
-                                    errors[model][key] += 1
-                                    print_and_save(
-                                        f"\t{model.__name__}#{obj.id} have a broken reference for {reference['name']}[{i}]"
-                                    )
+                            field_exists = (
+                                f"{reference['name']}__exists"  # Eg: "contact_points__exists"
+                            )
+                            if model.objects(id=obj.id, **{field_exists: True}).count() == 0:
+                                # See https://github.com/MongoEngine/mongoengine/issues/267#issuecomment-283065318
+                                # Setting it explicitely to an empty list actually removes the field, it shouldn't.
+                                errors[model][key] += 1
+                                print_and_save(
+                                    f"\t{model.__name__}#{obj.id} have a non existing field `{reference['name']}`, instead of an empty list"
+                                )
+                            else:
+                                attr_list = getattr(obj, reference["name"])
+                                for i, sub in enumerate(attr_list):
+                                    # If it's still an instance of DBRef it means that it failed to
+                                    # dereference the ID.
+                                    if isinstance(sub, DBRef):
+                                        errors[model][key] += 1
+                                        print_and_save(
+                                            f"\t{model.__name__}#{obj.id} have a broken reference for {reference['name']}[{i}]"
+                                        )
                         elif reference["type"] == "embed_list":
                             p1, p2 = reference["name"].split("__")
                             attr_list = getattr(obj, p1, [])
@@ -380,6 +392,7 @@ def check_references(models_to_check):
                 sentry_sdk.capture_message(f"{total} integrity errors", "fatal")
         except ImportError:
             print("`sentry_sdk` not installed. The errors weren't reported")
+        sys.exit(1)
 @grp.command()

udata/core/dataset/models.py CHANGED Viewed

@@ -10,7 +10,6 @@ import requests
 from blinker import signal
 from dateutil.parser import parse as parse_dt
 from flask import current_app, url_for
-from mongoengine import DynamicEmbeddedDocument
 from mongoengine import ValidationError as MongoEngineValidationError
 from mongoengine.fields import DateTimeField
 from mongoengine.signals import post_save, pre_init, pre_save
@@ -78,7 +77,7 @@ def get_json_ld_extra(key, value):
     }
-class HarvestDatasetMetadata(DynamicEmbeddedDocument):
+class HarvestDatasetMetadata(db.EmbeddedDocument):
     backend = db.StringField()
     created_at = db.DateTimeField()
     modified_at = db.DateTimeField()
@@ -91,12 +90,15 @@ class HarvestDatasetMetadata(DynamicEmbeddedDocument):
     dct_identifier = db.StringField()
     archived_at = db.DateTimeField()
     archived = db.StringField()
+    ckan_name = db.StringField()
+    ckan_source = db.StringField()
-class HarvestResourceMetadata(DynamicEmbeddedDocument):
+class HarvestResourceMetadata(db.EmbeddedDocument):
     created_at = db.DateTimeField()
     modified_at = db.DateTimeField()
     uri = db.StringField()
+    dct_identifier = db.StringField()
 class Schema(db.EmbeddedDocument):

udata/core/discussions/api.py CHANGED Viewed

@@ -247,7 +247,7 @@ class DiscussionAPI(API):
         return "", 204
-@ns.route("/<id>/comments/<int:cidx>/spam", endpoint="discussion_comment_spam")
+@ns.route("/<id>/comments/<int:cidx>/spam/", endpoint="discussion_comment_spam")
 @ns.doc(delete={"id": "unspam"})
 class DiscussionCommentSpamAPI(SpamAPIMixin):
     def get_model(self, id, cidx):
@@ -259,7 +259,7 @@ class DiscussionCommentSpamAPI(SpamAPIMixin):
         return discussion, discussion.discussion[cidx]
-@ns.route("/<id>/comments/<int:cidx>", endpoint="discussion_comment")
+@ns.route("/<id>/comments/<int:cidx>/", endpoint="discussion_comment")
 class DiscussionCommentAPI(API):
     """
     Base class for a comment in a discussion thread.

udata/core/jobs/api.py CHANGED Viewed

@@ -101,7 +101,7 @@ class JobsAPI(API):
         return form.save(), 201
-@ns.route("/jobs/<string:id>", endpoint="job")
+@ns.route("/jobs/<string:id>/", endpoint="job")
 @api.param("id", "A job ID")
 class JobAPI(API):
     def get_or_404(self, id):
@@ -139,7 +139,7 @@ class JobAPI(API):
         return "", 204
-@ns.route("/tasks/<string:id>", endpoint="task")
+@ns.route("/tasks/<string:id>/", endpoint="task")
 class TaskAPI(API):
     @api.marshal_with(task_fields)
     def get(self, id):
@@ -159,7 +159,7 @@ class TaskAPI(API):
         return data
-@ns.route("/jobs/schedulables", endpoint="schedulable_jobs")
+@ns.route("/jobs/schedulables/", endpoint="schedulable_jobs")
 class JobsReferenceAPI(API):
     @api.doc(model=[str])
     def get(self):

udata/core/metrics/helpers.py CHANGED Viewed

@@ -2,6 +2,7 @@ import logging
 from collections import OrderedDict
 from datetime import datetime, timedelta
 from typing import Dict, List, Union
+from urllib.parse import urlencode
 import requests
 from bson import ObjectId
@@ -68,6 +69,15 @@ def get_metrics_for_model(
         return [{} for _ in range(len(metrics_labels))]
+def get_download_url(model: str, id: Union[str, ObjectId, None]) -> str:
+    api_namespace = model + "s" if model != "site" else model
+    base_url = f"{current_app.config['METRICS_API']}/{api_namespace}/data/csv/"
+    args = {"metric_month__sort": "asc"}
+    if id:
+        args[f"{model}_id__exact"] = id
+    return f"{base_url}?{urlencode(args)}"
 def compute_monthly_aggregated_metrics(aggregation_res: CommandCursor) -> OrderedDict:
     monthly_metrics = OrderedDict((month, 0) for month in get_last_13_months())
     for monthly_count in aggregation_res:

udata/core/metrics/tasks.py CHANGED Viewed

@@ -1,9 +1,152 @@
+import logging
+import time
+from functools import wraps
+from typing import Dict, List
+import requests
 from flask import current_app
+from udata.core.dataservices.models import Dataservice
 from udata.core.metrics.signals import on_site_metrics_computed
-from udata.models import Site
+from udata.models import CommunityResource, Dataset, Organization, Reuse, Site, db
 from udata.tasks import job
+log = logging.getLogger(__name__)
+def log_timing(func):
+    @wraps(func)
+    def timeit_wrapper(*args, **kwargs):
+        # Better log if we're using Python 3.9
+        name = func.__name__
+        model = name.removeprefix("update_") if hasattr(name, "removeprefix") else name
+        log.info(f"Processing {model}…")
+        start_time = time.perf_counter()
+        result = func(*args, **kwargs)
+        total_time = time.perf_counter() - start_time
+        log.info(f"Done in {total_time:.4f} seconds.")
+        return result
+    return timeit_wrapper
+def save_model(model: db.Document, model_id: str, metrics: Dict[str, int]) -> None:
+    try:
+        result = model.objects(id=model_id).update(
+            **{f"set__metrics__{key}": value for key, value in metrics.items()}
+        )
+        if result is None:
+            log.debug(f"{model.__name__} not found", extra={"id": model_id})
+    except Exception as e:
+        log.exception(e)
+def iterate_on_metrics(target: str, value_keys: List[str], page_size: int = 50) -> dict:
+    """
+    Yield all elements with not zero values for the keys inside `value_keys`.
+    If you pass ['visit', 'download_resource'], it will do a `OR` and get
+    metrics with one of the two values not zero.
+    """
+    yielded = set()
+    for value_key in value_keys:
+        url = f"{current_app.config['METRICS_API']}/{target}_total/data/"
+        url += f"?{value_key}__greater=1&page_size={page_size}"
+        with requests.Session() as session:
+            while url is not None:
+                r = session.get(url, timeout=10)
+                r.raise_for_status()
+                data = r.json()
+                for row in data["data"]:
+                    if row["__id"] not in yielded:
+                        yielded.add(row["__id"])
+                        yield row
+                url = data["links"].get("next")
+@log_timing
+def update_resources_and_community_resources():
+    for data in iterate_on_metrics("resources", ["download_resource"]):
+        if data["dataset_id"] is None:
+            save_model(
+                CommunityResource,
+                data["resource_id"],
+                {
+                    "views": data["download_resource"],
+                },
+            )
+        else:
+            Dataset.objects(resources__id=data["resource_id"]).update(
+                **{"set__resources__$__metrics__views": data["download_resource"]}
+            )
+@log_timing
+def update_datasets():
+    for data in iterate_on_metrics("datasets", ["visit", "download_resource"]):
+        save_model(
+            Dataset,
+            data["dataset_id"],
+            {
+                "views": data["visit"],
+                "resources_downloads": data["download_resource"],
+            },
+        )
+@log_timing
+def update_dataservices():
+    for data in iterate_on_metrics("dataservices", ["visit"]):
+        save_model(
+            Dataservice,
+            data["dataservice_id"],
+            {
+                "views": data["visit"],
+            },
+        )
+@log_timing
+def update_reuses():
+    for data in iterate_on_metrics("reuses", ["visit"]):
+        save_model(Reuse, data["reuse_id"], {"views": data["visit"]})
+@log_timing
+def update_organizations():
+    # We're currently using visit_dataset as global metric for an orga
+    for data in iterate_on_metrics("organizations", ["visit_dataset"]):
+        save_model(
+            Organization,
+            data["organization_id"],
+            {
+                "views": data["visit_dataset"],
+            },
+        )
+def update_metrics_for_models():
+    log.info("Starting…")
+    update_datasets()
+    update_resources_and_community_resources()
+    update_dataservices()
+    update_reuses()
+    update_organizations()
+@job("update-metrics", route="low.metrics")
+def update_metrics(self):
+    """Update udata objects metrics"""
+    if not current_app.config["METRICS_API"]:
+        log.error("You need to set METRICS_API to run update-metrics")
+        exit(1)
+    update_metrics_for_models()
 @job("compute-site-metrics")
 def compute_site_metrics(self):

udata/core/organization/api.py CHANGED Viewed

@@ -99,7 +99,7 @@ class OrgApiParser(ModelApiParser):
         if args.get("badge"):
             organizations = organizations.with_badge(args["badge"])
         if args.get("name"):
-            organizations = organizations.filter(name=args["name"])
+            organizations = organizations.filter(name__iexact=args["name"])
         if args.get("business_number_id"):
             organizations = organizations.filter(business_number_id=args["business_number_id"])
         return organizations
@@ -449,7 +449,7 @@ class MembershipRefuseAPI(MembershipAPI):
         return {}, 200
-@ns.route("/<org:org>/member/<user:user>", endpoint="member", doc=common_doc)
+@ns.route("/<org:org>/member/<user:user>/", endpoint="member", doc=common_doc)
 class MemberAPI(API):
     @api.secure
     @api.expect(member_fields)

udata/core/post/api.py CHANGED Viewed

@@ -168,7 +168,7 @@ class PostAPI(API):
         return "", 204
-@ns.route("/<post:post>/publish", endpoint="publish_post")
+@ns.route("/<post:post>/publish/", endpoint="publish_post")
 class PublishPostAPI(API):
     @api.secure(admin_permission)
     @api.doc("publish_post")

udata/core/user/api.py CHANGED Viewed

@@ -194,7 +194,7 @@ class MyOrgDiscussionsAPI(API):
         return list(discussions)
-@me.route("/apikey", endpoint="my_apikey")
+@me.route("/apikey/", endpoint="my_apikey")
 class ApiKeyAPI(API):
     @api.secure
     @api.doc("generate_apikey")

udata/features/identicon/api.py CHANGED Viewed

@@ -5,7 +5,7 @@ from . import backends
 ns = api.namespace("avatars", "Avatars")
-@ns.route("/<identifier>/<int:size>", endpoint="avatar")
+@ns.route("/<identifier>/<int:size>/", endpoint="avatar")
 class IdenticonAPI(API):
     @api.doc("avatars")
     def get(self, identifier, size):

udata/harvest/actions.py CHANGED Viewed

@@ -29,6 +29,11 @@ log = logging.getLogger(__name__)
 DEFAULT_PAGE_SIZE = 10
+def get_source(ident):
+    """Get an harvest source given its ID or its slug"""
+    return HarvestSource.get(ident)
 def list_backends():
     """List all available backends"""
     return backends.get_all(current_app).values()
@@ -44,11 +49,6 @@ def list_sources(owner=None, deleted=False):
     return list(sources)
-def get_source(ident):
-    """Get an harvest source given its ID or its slug"""
-    return HarvestSource.get(ident)
 def get_job(ident):
     """Get an harvest job given its ID"""
     return HarvestJob.objects.get(id=ident)
@@ -89,31 +89,28 @@ def create_source(
     return source
-def update_source(ident, data):
+def update_source(source: HarvestSource, data):
     """Update an harvest source"""
-    source = get_source(ident)
     source.modify(**data)
     signals.harvest_source_updated.send(source)
     return source
-def validate_source(ident, comment=None):
+def validate_source(source: HarvestSource, comment=None):
     """Validate a source for automatic harvesting"""
-    source = get_source(ident)
     source.validation.on = datetime.utcnow()
     source.validation.comment = comment
     source.validation.state = VALIDATION_ACCEPTED
     if current_user.is_authenticated:
         source.validation.by = current_user._get_current_object()
     source.save()
-    schedule(ident, cron=current_app.config["HARVEST_DEFAULT_SCHEDULE"])
-    launch(ident)
+    schedule(source, cron=current_app.config["HARVEST_DEFAULT_SCHEDULE"])
+    launch(source)
     return source
-def reject_source(ident, comment):
+def reject_source(source: HarvestSource, comment):
     """Reject a source for automatic harvesting"""
-    source = get_source(ident)
     source.validation.on = datetime.utcnow()
     source.validation.comment = comment
     source.validation.state = VALIDATION_REFUSED
@@ -123,18 +120,16 @@ def reject_source(ident, comment):
     return source
-def delete_source(ident):
+def delete_source(source: HarvestSource):
     """Delete an harvest source"""
-    source = get_source(ident)
     source.deleted = datetime.utcnow()
     source.save()
     signals.harvest_source_deleted.send(source)
     return source
-def clean_source(ident):
+def clean_source(source: HarvestSource):
     """Deletes all datasets linked to a harvest source"""
-    source = get_source(ident)
     datasets = Dataset.objects.filter(harvest__source_id=str(source.id))
     for dataset in datasets:
         dataset.deleted = datetime.utcnow()
@@ -180,22 +175,20 @@ def purge_jobs():
     return HarvestJob.objects(created__lt=expiration).delete()
-def run(ident):
+def run(source: HarvestSource):
     """Launch or resume an harvesting for a given source if none is running"""
-    source = get_source(ident)
     cls = backends.get(current_app, source.backend)
     backend = cls(source)
     backend.harvest()
-def launch(ident):
+def launch(source: HarvestSource):
     """Launch or resume an harvesting for a given source if none is running"""
-    return harvest.delay(ident)
+    return harvest.delay(source.id)
-def preview(ident):
+def preview(source: HarvestSource):
     """Preview an harvesting for a given source"""
-    source = get_source(ident)
     cls = backends.get(current_app, source.backend)
     max_items = current_app.config["HARVEST_PREVIEW_MAX_ITEMS"]
     backend = cls(source, dryrun=True, max_items=max_items)
@@ -240,11 +233,15 @@ def preview_from_config(
 def schedule(
-    ident, cron=None, minute="*", hour="*", day_of_week="*", day_of_month="*", month_of_year="*"
+    source: HarvestSource,
+    cron=None,
+    minute="*",
+    hour="*",
+    day_of_week="*",
+    day_of_month="*",
+    month_of_year="*",
 ):
     """Schedule an harvesting on a source given a crontab"""
-    source = get_source(ident)
     if cron:
         minute, hour, day_of_month, month_of_year, day_of_week = cron.split()
@@ -273,9 +270,8 @@ def schedule(
     return source
-def unschedule(ident):
+def unschedule(source: HarvestSource):
     """Unschedule an harvesting on a source"""
-    source = get_source(ident)
     if not source.periodic_task:
         msg = "Harvesting on source {0} is ot scheduled".format(source.name)
         raise ValueError(msg)

udata 10.8.1.dev36703__py2.py3-none-any.whl → 10.8.2__py2.py3-none-any.whl

Potentially problematic release.

udata 10.8.1.dev36703__py2.py3-none-any.whl → 10.8.2__py2.py3-none-any.whl