udata 12.0.2.dev9__py3-none-any.whl → 12.0.2.dev11__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of udata might be problematic. Click here for more details.

@@ -186,7 +186,6 @@ MODULES_WITH_COMMANDS = [
186
186
  "core.spatial",
187
187
  "core.user",
188
188
  "harvest",
189
- "linkchecker",
190
189
  "search",
191
190
  ]
192
191
 
udata/core/dataset/api.py CHANGED
@@ -46,7 +46,6 @@ from udata.core.storages.api import handle_upload, upload_parser
46
46
  from udata.core.topic.models import Topic
47
47
  from udata.frontend.markdown import md
48
48
  from udata.i18n import gettext as _
49
- from udata.linkchecker.checker import check_resource
50
49
  from udata.rdf import RDF_EXTENSIONS, graph_response, negociate_content
51
50
  from udata.utils import get_by
52
51
 
@@ -902,20 +901,6 @@ class AllowedExtensionsAPI(API):
902
901
  return sorted(current_app.config["ALLOWED_RESOURCES_EXTENSIONS"])
903
902
 
904
903
 
905
- @ns.route(
906
- "/<dataset:dataset>/resources/<uuid:rid>/check/",
907
- endpoint="check_dataset_resource",
908
- doc=common_doc,
909
- )
910
- @api.param("rid", "The resource unique identifier")
911
- class CheckDatasetResource(API, ResourceMixin):
912
- @api.doc("check_dataset_resource")
913
- def get(self, dataset, rid):
914
- """Checks that a resource's URL exists and returns metadata."""
915
- resource = self.get_resource_or_404(dataset, rid)
916
- return check_resource(resource)
917
-
918
-
919
904
  @ns.route("/resource_types/", endpoint="resource_types")
920
905
  class ResourceTypesAPI(API):
921
906
  @api.doc("resource_types")
@@ -1,6 +1,6 @@
1
1
  import logging
2
2
  import re
3
- from datetime import datetime, timedelta
3
+ from datetime import datetime
4
4
  from pydoc import locate
5
5
  from typing import Self
6
6
  from urllib.parse import urlparse
@@ -8,7 +8,6 @@ from urllib.parse import urlparse
8
8
  import Levenshtein
9
9
  import requests
10
10
  from blinker import signal
11
- from dateutil.parser import parse as parse_dt
12
11
  from flask import current_app, url_for
13
12
  from mongoengine import ValidationError as MongoEngineValidationError
14
13
  from mongoengine.fields import DateTimeField
@@ -369,7 +368,13 @@ class ResourceMixin(object):
369
368
  mime = db.StringField()
370
369
  filesize = db.IntField() # `size` is a reserved keyword for mongoengine.
371
370
  fs_filename = db.StringField()
372
- extras = db.ExtrasField()
371
+ extras = db.ExtrasField(
372
+ {
373
+ "check:available": db.BooleanField,
374
+ "check:status": db.IntField,
375
+ "check:date": db.DateTimeField,
376
+ }
377
+ )
373
378
  harvest = db.EmbeddedDocumentField(HarvestResourceMetadata)
374
379
  schema = db.EmbeddedDocumentField(Schema)
375
380
 
@@ -428,41 +433,6 @@ class ResourceMixin(object):
428
433
  """
429
434
  return self.extras.get("check:available", "unknown")
430
435
 
431
- def need_check(self):
432
- """Does the resource needs to be checked against its linkchecker?
433
-
434
- We check unavailable resources often, unless they go over the
435
- threshold. Available resources are checked less and less frequently
436
- based on their historical availability.
437
- """
438
- min_cache_duration, max_cache_duration, ko_threshold = [
439
- current_app.config.get(k)
440
- for k in (
441
- "LINKCHECKING_MIN_CACHE_DURATION",
442
- "LINKCHECKING_MAX_CACHE_DURATION",
443
- "LINKCHECKING_UNAVAILABLE_THRESHOLD",
444
- )
445
- ]
446
- count_availability = self.extras.get("check:count-availability", 1)
447
- is_available = self.check_availability()
448
- if is_available == "unknown":
449
- return True
450
- elif is_available or count_availability > ko_threshold:
451
- delta = min(min_cache_duration * count_availability, max_cache_duration)
452
- else:
453
- delta = min_cache_duration
454
- if self.extras.get("check:date"):
455
- limit_date = datetime.utcnow() - timedelta(minutes=delta)
456
- check_date = self.extras["check:date"]
457
- if not isinstance(check_date, datetime):
458
- try:
459
- check_date = parse_dt(check_date)
460
- except (ValueError, TypeError):
461
- return True
462
- if check_date >= limit_date:
463
- return False
464
- return True
465
-
466
436
  @property
467
437
  def latest(self):
468
438
  """
udata/entrypoints.py CHANGED
@@ -4,10 +4,8 @@ import pkg_resources
4
4
  ENTRYPOINTS = {
5
5
  "udata.avatars": "Avatar rendering backends",
6
6
  "udata.harvesters": "Harvest backends",
7
- "udata.linkcheckers": "Link checker backends",
8
7
  "udata.metrics": "Extra metrics",
9
8
  "udata.models": "Models and migrations",
10
- "udata.preview": "Displays preview for resources",
11
9
  "udata.plugins": "Generic plugin",
12
10
  "udata.tasks": "Tasks and jobs",
13
11
  "udata.themes": "Themes",
udata/models/__init__.py CHANGED
@@ -31,8 +31,6 @@ from udata.features.territories.models import * # noqa
31
31
  # Load HarvestSource model as harvest for catalog
32
32
  from udata.harvest.models import HarvestSource as Harvest # noqa
33
33
 
34
- import udata.linkchecker.models # noqa
35
-
36
34
 
37
35
  def init_app(app):
38
36
  entrypoints.get_enabled("udata.models", app)
@@ -11,15 +11,16 @@ ALLOWED_TYPES = (str, int, float, bool, datetime, date, list, dict)
11
11
 
12
12
 
13
13
  class ExtrasField(DictField):
14
- def __init__(self, **kwargs):
14
+ def __init__(self, keys_types={}, **kwargs):
15
15
  self.registered = {}
16
+ for key, dbtype in keys_types.items():
17
+ self.register(key, dbtype)
16
18
  super(ExtrasField, self).__init__()
17
19
 
18
20
  def register(self, key, dbtype):
19
21
  """Register a DB type to add constraint on a given extra key"""
20
22
  if not issubclass(dbtype, (BaseField, EmbeddedDocument)):
21
- msg = "ExtrasField can only register MongoEngine fields"
22
- raise TypeError(msg)
23
+ raise TypeError("ExtrasField can only register MongoEngine fields")
23
24
  self.registered[key] = dbtype
24
25
 
25
26
  def validate(self, values):
udata/settings.py CHANGED
@@ -331,16 +331,6 @@ class Defaults(object):
331
331
  # The order is important to compute parents/children, smaller first.
332
332
  HANDLED_LEVELS = tuple()
333
333
 
334
- LINKCHECKING_ENABLED = True
335
- # Resource types ignored by linkchecker
336
- LINKCHECKING_UNCHECKED_TYPES = ("api",)
337
- LINKCHECKING_IGNORE_DOMAINS = []
338
- LINKCHECKING_IGNORE_PATTERNS = ["format=shp"]
339
- LINKCHECKING_MIN_CACHE_DURATION = 60 # in minutes
340
- LINKCHECKING_MAX_CACHE_DURATION = 1080 # in minutes (1 week)
341
- LINKCHECKING_UNAVAILABLE_THRESHOLD = 100
342
- LINKCHECKING_DEFAULT_LINKCHECKER = "no_check"
343
-
344
334
  # Ignore some endpoint from API tracking
345
335
  # By default ignore the 3 most called APIs
346
336
  TRACKING_BLACKLIST = [
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: udata
3
- Version: 12.0.2.dev9
3
+ Version: 12.0.2.dev11
4
4
  Summary: Open data portal
5
5
  Author-email: Opendata Team <opendatateam@data.gouv.fr>
6
6
  Maintainer-email: Opendata Team <opendatateam@data.gouv.fr>
@@ -2,7 +2,7 @@ udata/__init__.py,sha256=U0HEYqKCLOY43O1UCVeuAb3b3SSX1pPhsJGpHJmK67k,75
2
2
  udata/api_fields.py,sha256=XI0XoM1fxO4DEzxGptOAB5SL_fJr-u58-bfQVAvzgBg,36549
3
3
  udata/app.py,sha256=By-eZvSVSCNtWeKm_lA8TF81qaHdzScvggvbgHCDHYI,8992
4
4
  udata/cors.py,sha256=JttAogsNVSFWEV9-1L2kdbwwsNewn3KjgBErXReNwfc,3801
5
- udata/entrypoints.py,sha256=mbAAUVT8ZenzSYdang2PbAwZcK1pENtA3axBmPRiWCw,2717
5
+ udata/entrypoints.py,sha256=90n21thjq-r7EEyWf6IJZ6DS1q404OoP_VDkpRjZMvs,2611
6
6
  udata/errors.py,sha256=E8W7b4PH7c5B85g_nsUMt8fHqMVpDFOZFkO6wMPl6bA,117
7
7
  udata/factories.py,sha256=MoklZnU8iwNL25dm3JsoXhoQs1PQWSVYL1WvcUBtJqM,492
8
8
  udata/i18n.py,sha256=bC9ajf66YgcYoJffvresLZLa32rb6NsY-JGMtFiVsG4,8163
@@ -10,7 +10,7 @@ udata/mail.py,sha256=Huhx_1QthJkLvuRUuP6jqb5Qq5R4iSmqeEpLVO9ZkQ4,2671
10
10
  udata/rdf.py,sha256=aJKmnE1r6YyMKXLo-VRlUvOXoZSJuvNeNbMCqEl0kdY,19370
11
11
  udata/routing.py,sha256=Hnc1ktmKVS-RUHNKw2zYTft2HJ903FhjtlcenQ9igwI,8044
12
12
  udata/sentry.py,sha256=ekcxqUSqxfM98TtvCsPaOoX5i2l6PEcYt7kb4l3od-Q,3223
13
- udata/settings.py,sha256=o6sV0gAXgZY4if2pU017yR0pKwDTEFBFbnEWYDyBxZg,22110
13
+ udata/settings.py,sha256=1gDu1fnorsgzRzkMIEzFjWiVZ_7UPe1kRW-GQulb7O0,21686
14
14
  udata/sitemap.py,sha256=oRRWoPI7ZsFFnUAOqGT1YuXFFKHBe8EcRnUCNHD7xjM,979
15
15
  udata/tags.py,sha256=ydq4uokd6bzdeGVSpEXASVtGvDfO2LfQs9mptvvKJCM,631
16
16
  udata/tasks.py,sha256=Sv01dhvATtq_oHOBp3J1j1VT1HQe0Pab7zxwIeIdKoo,5122
@@ -34,7 +34,7 @@ udata/auth/mails.py,sha256=ggGfgYEgNLtF-p5HocrmuQAk0b6fteWtN4UzK3ZvMlA,1759
34
34
  udata/auth/password_validation.py,sha256=ODVdEsiXbtq_8ws4Yf3hs5Sq7jz-IDa1RxAm_WPIJnA,1806
35
35
  udata/auth/proconnect.py,sha256=hsvQ71Hqy42NvwgYtcMniRXWax3Q7LX1INcmaS7gaIQ,5073
36
36
  udata/auth/views.py,sha256=83nlLQiRlqLPILPOkKilNnrTJTs6c7XKqe-nt2I5l8s,7861
37
- udata/commands/__init__.py,sha256=Won_rW_hIU9TA3o4oNe6kI46l1fnDBM_oW0Hc1XS9F8,7711
37
+ udata/commands/__init__.py,sha256=y0ncKCjOTuxm9Tn8A0WMvub1Mai0wLQ15dq9y8GRV7c,7692
38
38
  udata/commands/cache.py,sha256=bLdrf_fCWFYX9ULlL2ADsZRwijkI4pArsJxfx24OivM,341
39
39
  udata/commands/db.py,sha256=OyVBcuSIqYqNywlZAi19F2yRJCAIdFKKyQ9H9alqbfI,20426
40
40
  udata/commands/dcat.py,sha256=f6jT2AGZem-w1CaRH_ahfWB9A4oCDvjG13tPmBpeCqw,3910
@@ -92,7 +92,7 @@ udata/core/dataservices/tasks.py,sha256=fHG1r5ymfJRXJ_Lug6je3VKZoK30XKXE2rQ8x0R-
92
92
  udata/core/dataset/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
93
93
  udata/core/dataset/actions.py,sha256=mX6xox0PiMrbcAPZ3VZsI26rfM-ciYfEXxN6sqqImKA,1222
94
94
  udata/core/dataset/activities.py,sha256=eGxMUnC47YHxTgcls6igQ3qP7cYgwFtPfj0asCylGsI,3315
95
- udata/core/dataset/api.py,sha256=4KjNwgJ5PGV6WWyGaJdpiivWoNYs79BeHH5e509GV7s,35861
95
+ udata/core/dataset/api.py,sha256=JBdYH2RXac1WVOF67ZzBSvImVxixMfN0rxEknnfthRU,35338
96
96
  udata/core/dataset/api_fields.py,sha256=p7ZnmGNImZ4sgZTpoyHpI0CgOukpEIx8QdGnxlmgl2I,18032
97
97
  udata/core/dataset/apiv2.py,sha256=1H4557ZMi6rwEyrwB1Ha20m0bf3Avhg_vDLiDQt5Fi0,21030
98
98
  udata/core/dataset/commands.py,sha256=3mKSdJ-M7ggdG29AVn77C4ouZanbYoqkTaGQoBKOp3s,3471
@@ -103,7 +103,7 @@ udata/core/dataset/exceptions.py,sha256=uKiayLSpSzsnLvClObS6hOO0qXEqvURKN7_w8eim
103
103
  udata/core/dataset/factories.py,sha256=tb18axsk8Tx5iUIqWM9IELdt-2Ryp2UN0-iY4fdea4U,9059
104
104
  udata/core/dataset/forms.py,sha256=gGXOqy3WXFNvWMXngDq3TEqMM18-9DBpy2V7msrOsTw,7084
105
105
  udata/core/dataset/metrics.py,sha256=s8Xs_rqRXfNWsErkiJTuRMG5o_cU5iSK8mUJFKVSc7w,1204
106
- udata/core/dataset/models.py,sha256=Ny79cFiD7vOJt0JIfOZ7AJQOBx-_EEWPxdnhBpUr_-E,42980
106
+ udata/core/dataset/models.py,sha256=1iXf6l0wjNv04if6gAq-NXSPLub2Mx2KOyCg2RQtids,41617
107
107
  udata/core/dataset/permissions.py,sha256=zXQ6kU-Ni3Pl5tDtat-ZPupug9InsNeCN7xRLc2Vcrc,1097
108
108
  udata/core/dataset/preview.py,sha256=uFEpK-p5nIAlY8hVOMhd7mtkwFt6C_PQRMNxPvAyoo4,839
109
109
  udata/core/dataset/rdf.py,sha256=jHyHgay3g3Z04Ju5kLyMk3Vyb-kWJYtqqg8Smamv7WI,32970
@@ -356,12 +356,6 @@ udata/harvest/tests/dcat/partial-collection-1.jsonld,sha256=emPZGvpdaqgIVTgtmlWN
356
356
  udata/harvest/tests/dcat/partial-collection-2.jsonld,sha256=zJ1ggcs2b4IJBDJ6zKJn8w5arjJBU_EHr6qFd8tu0I8,3691
357
357
  udata/harvest/tests/dcat/sig.oreme.rdf,sha256=6F1P-hPyE8bZC-5uQTskTawRd19U0opIy9LCDwp0sd4,6315
358
358
  udata/harvest/tests/dcat/udata.xml,sha256=co7tLKinEdPOwEScHUXSqSAKAFSgLElYOCI3uu50Sgo,14532
359
- udata/linkchecker/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
360
- udata/linkchecker/backends.py,sha256=Xe_nWwjKsdMv7kgs3mbQDppVO4lqaJlboUxa_0nk1XM,1018
361
- udata/linkchecker/checker.py,sha256=LB6WSMp5yvdmfdlq2ygp7uHoYEVsK0IPk6qnmhrS4GI,2903
362
- udata/linkchecker/commands.py,sha256=14c78P0FTLncuJpAgPizoCY62bUIknKumev1FPZ4JQU,454
363
- udata/linkchecker/models.py,sha256=V5PBSOGnCXZrZLoFQwKb91enpgVTbX4Q3_oWOPihJic,423
364
- udata/linkchecker/tasks.py,sha256=jETwfn6v-mP4jh_ovz4EwlxBUcRij_wHJVK1AmC0qo4,1918
365
359
  udata/migrations/2020-07-24-remove-s-from-scope-oauth.py,sha256=bCfBRcd4azHSV-fpZQaiHkWu7likHoWNpq9PeJvi5S0,665
366
360
  udata/migrations/2020-08-24-add-fs-filename.py,sha256=7mz5ufFvY67NDKEmQn0HS3u1SdR9uH077ngJfd2uot4,1819
367
361
  udata/migrations/2020-09-28-update-reuses-datasets-metrics.py,sha256=d7zizz1fwy-jCOJWmfveBpBZJSHJu46HSIBf3Xnd0eI,407
@@ -404,13 +398,13 @@ udata/migrations/2025-07-30-purge-old-harvest-dynamic-fields.py,sha256=ijeu6WvX6
404
398
  udata/migrations/2025-09-04-update-legacy-frequencies.py,sha256=8YAROAHhytf6Kses_54aFscmPNes2aHRYqTwNpawdVk,1693
405
399
  udata/migrations/2025-10-01-delete-orphaned-topic-elements.py,sha256=Mhx5ANOihZL4botxtjvfsll-xKBtkVQBkPcSq0BJ-Ec,788
406
400
  udata/migrations/__init__.py,sha256=RBCBDaTlLjuMs_Qzwji6Z6T4r7FCGXhESKoxQbT5qAA,11221
407
- udata/models/__init__.py,sha256=77OriDFm4dJzQDIb-7ADKSkf9GAxkXbMHeWXoYYnTsk,1459
401
+ udata/models/__init__.py,sha256=RT0WaPKm2linlYnTmD0xck7M2XuIoIJ94E0ErzNGzaw,1418
408
402
  udata/mongo/__init__.py,sha256=y4Rv-kq3o_kcEulcNpePLzocXPBNpx3Jd82G-VZPaMc,1421
409
403
  udata/mongo/datetime_fields.py,sha256=xACagQZu1OKPvpcznI-bMC1tJfAvo-VBUe7OOadnBdg,2089
410
404
  udata/mongo/document.py,sha256=yJl4rzE0L69SvNbtmnmyCALTGhXwBPrj7nvM-J6sDpE,1792
411
405
  udata/mongo/engine.py,sha256=JF9N55j7joDIn9NrItMtlIrA5CwVLhS_jlB2ptX94oA,2408
412
406
  udata/mongo/errors.py,sha256=SpTMAc_aNIfGkqyXGCbTlIAmYxU86rGM_NtIYaB642c,472
413
- udata/mongo/extras_fields.py,sha256=FfyVvRkpLn4pUeCqwI33NBJblHOywGlnA05RCEZ-ugs,4139
407
+ udata/mongo/extras_fields.py,sha256=knb0fwt8eIEOA0jjeUAs9Gmn_cfUNVPuRdGEVcJzE2Y,4218
414
408
  udata/mongo/queryset.py,sha256=fXfYkUHsCWAUoub3OR7v825USPv-PQQIHkv4U5FnjYg,3954
415
409
  udata/mongo/slug_fields.py,sha256=tEUlwozrdQfF42KR5dxk5PUNSX7zISTIXsSgHxR4YMg,7522
416
410
  udata/mongo/taglist_field.py,sha256=RPi8DlgMEMK1wk8hbQDLAyH2GnzZCfNpWXQsllxPB6g,1371
@@ -504,7 +498,6 @@ udata/tests/test_api_fields.py,sha256=NCUTtOMEaTM5-tK-YUxhjEud2IPIDOHR3vbZWAQdEC
504
498
  udata/tests/test_cors.py,sha256=i1SQS91lm-i3YEUqoHKUFpOI7TCpDx89MzHoWd3r2uk,2932
505
499
  udata/tests/test_dcat_commands.py,sha256=fDAnAjkja8AXw_qzaAWnVTgglkBAvK2mjPMHUCtqrrU,919
506
500
  udata/tests/test_discussions.py,sha256=a2yBGfOSm93w8zP2s2gXy51LGniiZ0lbrej0uuXNd2E,47685
507
- udata/tests/test_linkchecker.py,sha256=W8jrwKYXM8wWXZFjiaBwpWGRBhZ8bsSHGHzL9voDN7U,10218
508
501
  udata/tests/test_mail.py,sha256=f-8meP9r1Xrz0eOTsvmdynoV9OFHLwwMr7XM5WWv_gk,4182
509
502
  udata/tests/test_migrations.py,sha256=Iq0gt3hjK6ohliIvEpqyt0tYJz0MKIzBg_yfw7dOcHo,15698
510
503
  udata/tests/test_model.py,sha256=mwWf8tbh4iFVzqIpTvpobjfCQOdo8EJ5x3t8mXHmd7g,21999
@@ -632,9 +625,9 @@ udata/translations/pt/LC_MESSAGES/udata.mo,sha256=nv80xZLfIfUsSOMBcr29L268FDc_Gt
632
625
  udata/translations/pt/LC_MESSAGES/udata.po,sha256=bUp-7Ray8t8ALgJk3Icw1jmiGIc9_pEJQHiGw_2EU2o,50989
633
626
  udata/translations/sr/LC_MESSAGES/udata.mo,sha256=Y_XpUxD074wXc63oJTnoVOyOQ2lmBxl-MrgluZ0Qdw4,27961
634
627
  udata/translations/sr/LC_MESSAGES/udata.po,sha256=qh8mrz9AFuVQtXYSSP4QWsXLM_Lv3EHVifHT1NflWXY,57529
635
- udata-12.0.2.dev9.dist-info/licenses/LICENSE,sha256=V8j_M8nAz8PvAOZQocyRDX7keai8UJ9skgmnwqETmdY,34520
636
- udata-12.0.2.dev9.dist-info/METADATA,sha256=BBbAgWGTEx3bRQ_tAsh5fbWIao80z-0khr74HXfnQso,5272
637
- udata-12.0.2.dev9.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
638
- udata-12.0.2.dev9.dist-info/entry_points.txt,sha256=v2u12qO11i2lyLNIp136WmLJ-NHT-Kew3Duu8J-AXPM,614
639
- udata-12.0.2.dev9.dist-info/top_level.txt,sha256=EF6CE6YSHd_og-8LCEA4q25ALUpWVe8D0okOLdMAE3A,6
640
- udata-12.0.2.dev9.dist-info/RECORD,,
628
+ udata-12.0.2.dev11.dist-info/licenses/LICENSE,sha256=V8j_M8nAz8PvAOZQocyRDX7keai8UJ9skgmnwqETmdY,34520
629
+ udata-12.0.2.dev11.dist-info/METADATA,sha256=Yno-kuZXIaNIlsxQaFAEjEfwiWeFtm-i4tzMXULFLYs,5273
630
+ udata-12.0.2.dev11.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
631
+ udata-12.0.2.dev11.dist-info/entry_points.txt,sha256=v2u12qO11i2lyLNIp136WmLJ-NHT-Kew3Duu8J-AXPM,614
632
+ udata-12.0.2.dev11.dist-info/top_level.txt,sha256=EF6CE6YSHd_og-8LCEA4q25ALUpWVe8D0okOLdMAE3A,6
633
+ udata-12.0.2.dev11.dist-info/RECORD,,
File without changes
@@ -1,31 +0,0 @@
1
- import logging
2
- from datetime import datetime
3
-
4
- from flask import current_app
5
-
6
- from udata.entrypoints import get_enabled
7
-
8
- log = logging.getLogger(__name__)
9
-
10
-
11
- ENTRYPOINT = "udata.linkcheckers"
12
-
13
-
14
- class NoCheckLinkchecker(object):
15
- """Dummy linkchecker for resources that need no check"""
16
-
17
- def check(self, _):
18
- return {"check:status": 204, "check:available": True, "check:date": datetime.utcnow()}
19
-
20
-
21
- def get(name):
22
- """Get a linkchecker given its name or fallback on default"""
23
- linkcheckers = get_enabled(ENTRYPOINT, current_app)
24
- linkcheckers.update(no_check=NoCheckLinkchecker) # no_check always enabled
25
- selected_linkchecker = linkcheckers.get(name)
26
- if not selected_linkchecker:
27
- default_linkchecker = current_app.config.get("LINKCHECKING_DEFAULT_LINKCHECKER")
28
- selected_linkchecker = linkcheckers.get(default_linkchecker)
29
- if not selected_linkchecker:
30
- log.error("No linkchecker found ({} requested and no fallback)".format(name))
31
- return selected_linkchecker
@@ -1,75 +0,0 @@
1
- from urllib.parse import urlparse
2
-
3
- from flask import current_app
4
-
5
- from .backends import NoCheckLinkchecker
6
- from .backends import get as get_linkchecker
7
-
8
-
9
- def _get_check_keys(the_dict, resource, previous_status):
10
- check_keys = {k: v for k, v in the_dict.items() if k.startswith("check:")}
11
- check_keys["check:count-availability"] = _compute_count_availability(
12
- resource, check_keys.get("check:available"), previous_status
13
- )
14
- return check_keys
15
-
16
-
17
- def _compute_count_availability(resource, status, previous_status):
18
- """Compute the `check:count-availability` extra value"""
19
- count_availability = resource.extras.get("check:count-availability", 1)
20
- return count_availability + 1 if status == previous_status else 1
21
-
22
-
23
- def is_ignored(resource):
24
- """Check if the resource's URL is to be ignored"""
25
- ignored_domains = current_app.config["LINKCHECKING_IGNORE_DOMAINS"]
26
- ignored_patterns = current_app.config["LINKCHECKING_IGNORE_PATTERNS"]
27
- url = resource.url
28
- if not url:
29
- return True
30
- parsed_url = urlparse(url)
31
- ignored_domains_match = parsed_url.netloc in ignored_domains
32
- ignored_patterns_match = any([p in url for p in ignored_patterns])
33
- return ignored_domains_match or ignored_patterns_match
34
-
35
-
36
- def dummy_check_response():
37
- """Trigger a dummy check"""
38
- return NoCheckLinkchecker().check(None)
39
-
40
-
41
- def check_resource(resource):
42
- """
43
- Check a resource availability against a linkchecker backend
44
-
45
- The linkchecker used can be configured on a resource basis by setting
46
- the `resource.extras['check:checker']` attribute with a key that points
47
- to a valid `udata.linkcheckers` entrypoint. If not set, it will
48
- fallback on the default linkchecker defined by the configuration variable
49
- `LINKCHECKING_DEFAULT_LINKCHECKER`.
50
-
51
- Returns
52
- -------
53
- dict or (dict, int)
54
- Check results dict and status code (if error).
55
- """
56
- linkchecker_type = resource.extras.get("check:checker")
57
- LinkChecker = get_linkchecker(linkchecker_type)
58
- if not LinkChecker:
59
- return {"error": "No linkchecker configured."}, 503
60
- if is_ignored(resource):
61
- return dummy_check_response()
62
- result = LinkChecker().check(resource)
63
- if not result:
64
- return {"error": "No response from linkchecker"}, 503
65
- elif result.get("check:error"):
66
- return {"error": result["check:error"]}, 500
67
- elif not result.get("check:status"):
68
- return {"error": "No status in response from linkchecker"}, 503
69
- # store the check result in the resource's extras
70
- # XXX maybe this logic should be in the `Resource` model?
71
- previous_status = resource.extras.get("check:available")
72
- check_keys = _get_check_keys(result, resource, previous_status)
73
- resource.extras.update(check_keys)
74
- resource.save(signal_kwargs={"ignores": ["post_save"]}) # Prevent signal triggering on dataset
75
- return result
@@ -1,21 +0,0 @@
1
- import logging
2
-
3
- import click
4
-
5
- from udata.commands import cli
6
- from udata.linkchecker.tasks import check_resources
7
-
8
- log = logging.getLogger(__name__)
9
-
10
-
11
- @cli.group("linkchecker")
12
- def grp():
13
- """Link checking operations"""
14
- pass
15
-
16
-
17
- @grp.command()
18
- @click.option("-n", "--number", type=int, default=5000, help="Number of URLs to check")
19
- def check(number):
20
- """Check <number> of URLs that have not been (recently) checked"""
21
- check_resources(number)
@@ -1,9 +0,0 @@
1
- from udata.core.dataset.models import ResourceMixin
2
- from udata.mongo import db
3
-
4
- # Register harvest extras
5
- ResourceMixin.extras.register("check:available", db.BooleanField)
6
- ResourceMixin.extras.register("check:count-availability", db.IntField)
7
- ResourceMixin.extras.register("check:status", db.IntField)
8
- ResourceMixin.extras.register("check:url", db.StringField)
9
- ResourceMixin.extras.register("check:date", db.DateTimeField)
@@ -1,55 +0,0 @@
1
- import logging
2
- import uuid
3
-
4
- from flask import current_app
5
-
6
- from udata.models import Dataset
7
- from udata.tasks import job
8
- from udata.utils import get_by
9
-
10
- from .checker import check_resource
11
-
12
- log = logging.getLogger(__name__)
13
-
14
-
15
- @job("check_resources")
16
- def check_resources(self, number):
17
- """Check <number> of URLs that have not been (recently) checked"""
18
- if not current_app.config.get("LINKCHECKING_ENABLED"):
19
- log.error("Link checking is disabled.")
20
- return
21
-
22
- base_pipeline = [
23
- {"$match": {"resources": {"$gt": []}}},
24
- {"$project": {"resources._id": True, "resources.extras.check:date": True}},
25
- {"$unwind": "$resources"},
26
- ]
27
- # unchecked resources
28
- pipeline = base_pipeline + [
29
- {"$match": {"resources.extras.check:date": {"$eq": None}}},
30
- {"$limit": number},
31
- ]
32
- resources = list(Dataset.objects.aggregate(*pipeline))
33
- # not recently checked resources
34
- slots_left = number - len(resources)
35
- if slots_left:
36
- pipeline = base_pipeline + [
37
- {"$match": {"resources.extras.check:date": {"$ne": None}}},
38
- {"$sort": {"resources.extras.check:date": 1}},
39
- {"$limit": slots_left},
40
- ]
41
- resources += list(Dataset.objects.aggregate(*pipeline))
42
-
43
- nb_resources = len(resources)
44
- log.info("Checking %s resources...", nb_resources)
45
- for idx, dataset_resource in enumerate(resources):
46
- dataset_obj = Dataset.objects.get(id=dataset_resource["_id"])
47
- resource_id = dataset_resource["resources"]["_id"]
48
- rid = uuid.UUID(resource_id)
49
- resource_obj = get_by(dataset_obj.resources, "id", rid)
50
- log.info("Checking resource %s (%s/%s)", resource_id, idx + 1, nb_resources)
51
- if resource_obj.need_check():
52
- check_resource(resource_obj)
53
- else:
54
- log.info("--> Skipping this resource, cache is fresh enough.")
55
- log.info("Done.")
@@ -1,277 +0,0 @@
1
- from datetime import datetime, timedelta
2
-
3
- import mock
4
- import pytest
5
-
6
- from udata.auth import login_user
7
- from udata.core.activity import init_app as init_activity
8
- from udata.core.activity.models import Activity
9
- from udata.core.dataset.factories import DatasetFactory, ResourceFactory
10
- from udata.core.user.factories import UserFactory
11
- from udata.linkchecker.checker import check_resource
12
- from udata.settings import Testing
13
- from udata.tests import TestCase
14
-
15
-
16
- class LinkcheckerTestSettings(Testing):
17
- LINKCHECKING_ENABLED = True
18
- LINKCHECKING_IGNORE_DOMAINS = ["example-ignore.com"]
19
- LINKCHECKING_IGNORE_PATTERNS = ["format=shp"]
20
- LINKCHECKING_MIN_CACHE_DURATION = 0.5
21
- LINKCHECKING_UNAVAILABLE_THRESHOLD = 100
22
- LINKCHECKING_MAX_CACHE_DURATION = 100
23
-
24
-
25
- @pytest.fixture
26
- def activity_app(app):
27
- init_activity(app)
28
- yield app
29
-
30
-
31
- def test_check_resource_creates_no_activity(activity_app, mocker):
32
- resource = ResourceFactory()
33
- dataset = DatasetFactory(resources=[resource])
34
- user = UserFactory()
35
- login_user(user)
36
- check_res = {"check:status": 200, "check:available": True, "check:date": datetime.utcnow()}
37
-
38
- class DummyLinkchecker:
39
- def check(self, _):
40
- return check_res
41
-
42
- mocker.patch("udata.linkchecker.checker.get_linkchecker", return_value=DummyLinkchecker)
43
-
44
- check_resource(resource)
45
-
46
- activities = Activity.objects.filter(related_to=dataset)
47
- assert len(activities) == 0
48
-
49
-
50
- class LinkcheckerTest(TestCase):
51
- settings = LinkcheckerTestSettings
52
-
53
- def setUp(self):
54
- self.resource = ResourceFactory()
55
- self.dataset = DatasetFactory(resources=[self.resource])
56
-
57
- @mock.patch("udata.linkchecker.checker.get_linkchecker")
58
- def test_check_resource_no_linkchecker(self, mock_fn):
59
- mock_fn.return_value = None
60
- res = check_resource(self.resource)
61
- self.assertEqual(res, ({"error": "No linkchecker configured."}, 503))
62
-
63
- @mock.patch("udata.linkchecker.checker.get_linkchecker")
64
- def test_check_resource_linkchecker_ok(self, mock_fn):
65
- check_res = {"check:status": 200, "check:available": True, "check:date": datetime.utcnow()}
66
-
67
- class DummyLinkchecker:
68
- def check(self, _):
69
- return check_res
70
-
71
- mock_fn.return_value = DummyLinkchecker
72
-
73
- res = check_resource(self.resource)
74
- self.assertEqual(res, check_res)
75
- check_res.update({"check:count-availability": 1})
76
- self.assertEqual(self.resource.extras, check_res)
77
-
78
- @mock.patch("udata.linkchecker.checker.get_linkchecker")
79
- def test_check_resource_filter_result(self, mock_fn):
80
- check_res = {"check:status": 200, "dummy": "dummy"}
81
-
82
- class DummyLinkchecker:
83
- def check(self, _):
84
- return check_res
85
-
86
- mock_fn.return_value = DummyLinkchecker
87
-
88
- res = check_resource(self.resource)
89
- self.assertEqual(res, check_res)
90
- self.assertNotIn("dummy", self.resource.extras)
91
-
92
- @mock.patch("udata.linkchecker.checker.get_linkchecker")
93
- def test_check_resource_linkchecker_no_status(self, mock_fn):
94
- class DummyLinkchecker:
95
- def check(self, _):
96
- return {"check:available": True}
97
-
98
- mock_fn.return_value = DummyLinkchecker
99
- res = check_resource(self.resource)
100
- self.assertEqual(res, ({"error": "No status in response from linkchecker"}, 503))
101
-
102
- @mock.patch("udata.linkchecker.checker.get_linkchecker")
103
- def test_check_resource_linkchecker_check_error(self, mock_fn):
104
- class DummyLinkchecker:
105
- def check(self, _):
106
- return {"check:error": "ERROR"}
107
-
108
- mock_fn.return_value = DummyLinkchecker
109
- res = check_resource(self.resource)
110
- self.assertEqual(res, ({"error": "ERROR"}, 500))
111
-
112
- @mock.patch("udata.linkchecker.checker.get_linkchecker")
113
- def test_check_resource_linkchecker_in_resource(self, mock_fn):
114
- self.resource.extras["check:checker"] = "another_linkchecker"
115
- self.resource.save()
116
- check_resource(self.resource)
117
- args, kwargs = mock_fn.call_args
118
- self.assertEqual(args, ("another_linkchecker",))
119
-
120
- def test_check_resource_linkchecker_no_check(self):
121
- self.resource.extras["check:checker"] = "no_check"
122
- self.resource.save()
123
- res = check_resource(self.resource)
124
- self.assertEqual(res.get("check:status"), 204)
125
- self.assertEqual(res.get("check:available"), True)
126
-
127
- def test_check_resource_ignored_domain(self):
128
- self.resource.extras = {}
129
- self.resource.url = "http://example-ignore.com/url"
130
- self.resource.save()
131
- res = check_resource(self.resource)
132
- self.assertEqual(res.get("check:status"), 204)
133
- self.assertEqual(res.get("check:available"), True)
134
-
135
- def test_check_resource_ignored_pattern(self):
136
- self.resource.extras = {}
137
- self.resource.url = "http://example.com/url?format=shp"
138
- self.resource.save()
139
- res = check_resource(self.resource)
140
- self.assertEqual(res.get("check:status"), 204)
141
- self.assertEqual(res.get("check:available"), True)
142
-
143
- def test_is_need_check(self):
144
- self.resource.extras = {
145
- "check:available": True,
146
- "check:date": datetime.utcnow(),
147
- "check:status": 42,
148
- }
149
- self.assertFalse(self.resource.need_check())
150
-
151
- def test_is_need_check_unknown_status(self):
152
- self.resource.extras = {}
153
- self.assertTrue(self.resource.need_check())
154
-
155
- def test_is_need_check_cache_expired(self):
156
- self.resource.extras = {
157
- "check:available": True,
158
- "check:date": datetime.utcnow() - timedelta(seconds=3600),
159
- "check:status": 42,
160
- }
161
- self.assertTrue(self.resource.need_check())
162
-
163
- def test_is_need_check_date_string(self):
164
- check_date = (datetime.utcnow() - timedelta(seconds=3600)).isoformat()
165
- self.resource.extras = {
166
- "check:available": True,
167
- "check:date": check_date,
168
- "check:status": 42,
169
- }
170
- self.assertTrue(self.resource.need_check())
171
-
172
- def test_is_need_check_wrong_check_date(self):
173
- check_date = "123azerty"
174
- self.resource.extras = {
175
- "check:available": True,
176
- "check:date": check_date,
177
- "check:status": 42,
178
- }
179
- self.assertTrue(self.resource.need_check())
180
-
181
- def test_is_need_check_wrong_check_date_int(self):
182
- check_date = 42
183
- self.resource.extras = {
184
- "check:available": True,
185
- "check:date": check_date,
186
- "check:status": 42,
187
- }
188
- self.assertTrue(self.resource.need_check())
189
-
190
- def test_is_need_check_count_availability(self):
191
- self.resource.extras = {
192
- # should need a new check after 100 * 30s = 3000s < 3600s
193
- "check:count-availability": 100,
194
- "check:available": True,
195
- "check:date": datetime.utcnow() - timedelta(seconds=3600),
196
- "check:status": 42,
197
- }
198
- self.assertTrue(self.resource.need_check())
199
-
200
- def test_is_need_check_count_availability_expired(self):
201
- self.resource.extras = {
202
- # should need a new check after 150 * 30s = 4500s > 3600s
203
- "check:count-availability": 150,
204
- "check:available": True,
205
- "check:date": datetime.utcnow() - timedelta(seconds=3600),
206
- "check:status": 42,
207
- }
208
- self.assertFalse(self.resource.need_check())
209
-
210
- def test_is_need_check_count_availability_unavailable(self):
211
- self.resource.extras = {
212
- # should need a new check after 30s < 3600S
213
- # count-availability is below threshold
214
- "check:count-availability": 95,
215
- "check:available": False,
216
- "check:date": datetime.utcnow() - timedelta(seconds=3600),
217
- "check:status": 42,
218
- }
219
- self.assertTrue(self.resource.need_check())
220
-
221
- @mock.patch("udata.linkchecker.checker.get_linkchecker")
222
- def test_count_availability_increment(self, mock_fn):
223
- check_res = {"check:status": 200, "check:available": True, "check:date": datetime.utcnow()}
224
-
225
- class DummyLinkchecker:
226
- def check(self, _):
227
- return check_res
228
-
229
- mock_fn.return_value = DummyLinkchecker
230
-
231
- check_resource(self.resource)
232
- self.assertEqual(self.resource.extras["check:count-availability"], 1)
233
-
234
- check_resource(self.resource)
235
- self.assertEqual(self.resource.extras["check:count-availability"], 2)
236
-
237
- @mock.patch("udata.linkchecker.checker.get_linkchecker")
238
- def test_count_availability_reset(self, mock_fn):
239
- self.resource.extras = {
240
- "check:status": 200,
241
- "check:available": True,
242
- "check:date": datetime.utcnow(),
243
- "check:count-availability": 2,
244
- }
245
- check_res = {"check:status": 200, "check:available": False, "check:date": datetime.utcnow()}
246
-
247
- class DummyLinkchecker:
248
- def check(self, _):
249
- return check_res
250
-
251
- mock_fn.return_value = DummyLinkchecker
252
-
253
- check_resource(self.resource)
254
- self.assertEqual(self.resource.extras["check:count-availability"], 1)
255
-
256
- def test_count_availability_threshold(self):
257
- self.resource.extras = {
258
- "check:status": 404,
259
- "check:available": False,
260
- # if it weren't above threshold, should need check (>30s)
261
- # and we're still below max_cache 101 * 0.5 < 100
262
- "check:date": datetime.utcnow() - timedelta(seconds=60),
263
- "check:count-availability": 101,
264
- }
265
- self.assertFalse(self.resource.need_check())
266
-
267
- def test_count_availability_max_cache_duration(self):
268
- self.resource.extras = {
269
- "check:status": 200,
270
- "check:available": True,
271
- # next check should be at 300 * 0.5 = 150min
272
- # but we are above max cache duration 150min > 100min
273
- # and 120m > 100 min so we should need a new check
274
- "check:date": datetime.utcnow() - timedelta(minutes=120),
275
- "check:count-availability": 300,
276
- }
277
- self.assertTrue(self.resource.need_check())