wbnews 1.58.3__py2.py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (65) hide show
  1. wbnews/.coveragerc +23 -0
  2. wbnews/__init__.py +1 -0
  3. wbnews/admin.py +30 -0
  4. wbnews/apps.py +9 -0
  5. wbnews/factories.py +36 -0
  6. wbnews/filters/__init__.py +1 -0
  7. wbnews/filters/news.py +46 -0
  8. wbnews/fixtures/wbnews.yaml +1 -0
  9. wbnews/import_export/__init__.py +0 -0
  10. wbnews/import_export/backends/__init__.py +1 -0
  11. wbnews/import_export/backends/news.py +36 -0
  12. wbnews/import_export/handlers/__init__.py +1 -0
  13. wbnews/import_export/handlers/news.py +57 -0
  14. wbnews/import_export/parsers/__init__.py +0 -0
  15. wbnews/import_export/parsers/emails/__init__.py +0 -0
  16. wbnews/import_export/parsers/emails/news.py +39 -0
  17. wbnews/import_export/parsers/emails/utils.py +65 -0
  18. wbnews/import_export/parsers/rss/__init__.py +0 -0
  19. wbnews/import_export/parsers/rss/news.py +58 -0
  20. wbnews/locale/de/LC_MESSAGES/django.mo +0 -0
  21. wbnews/locale/de/LC_MESSAGES/django.po +166 -0
  22. wbnews/locale/de/LC_MESSAGES/django.po.translated +173 -0
  23. wbnews/locale/en/LC_MESSAGES/django.mo +0 -0
  24. wbnews/locale/en/LC_MESSAGES/django.po +159 -0
  25. wbnews/locale/fr/LC_MESSAGES/django.mo +0 -0
  26. wbnews/locale/fr/LC_MESSAGES/django.po +162 -0
  27. wbnews/migrations/0001_initial_squashed_0005_alter_news_import_source.py +349 -0
  28. wbnews/migrations/0006_alter_news_language.py +122 -0
  29. wbnews/migrations/0007_auto_20240103_0955.py +43 -0
  30. wbnews/migrations/0008_alter_news_language.py +123 -0
  31. wbnews/migrations/0009_newsrelationship_analysis_newsrelationship_sentiment.py +94 -0
  32. wbnews/migrations/0010_newsrelationship_important.py +17 -0
  33. wbnews/migrations/0011_newsrelationship_content_object_repr.py +18 -0
  34. wbnews/migrations/0012_alter_news_unique_together_news_identifier_and_more.py +91 -0
  35. wbnews/migrations/0013_alter_news_datetime.py +19 -0
  36. wbnews/migrations/0014_newsrelationship_unique_news_relationship.py +27 -0
  37. wbnews/migrations/__init__.py +0 -0
  38. wbnews/models/__init__.py +3 -0
  39. wbnews/models/llm/cleaned_news.py +66 -0
  40. wbnews/models/news.py +131 -0
  41. wbnews/models/relationships.py +45 -0
  42. wbnews/models/sources.py +73 -0
  43. wbnews/models/utils.py +15 -0
  44. wbnews/serializers.py +134 -0
  45. wbnews/signals.py +4 -0
  46. wbnews/tasks.py +16 -0
  47. wbnews/tests/__init__.py +0 -0
  48. wbnews/tests/conftest.py +6 -0
  49. wbnews/tests/parsers/__init__.py +0 -0
  50. wbnews/tests/parsers/test_emails.py +25 -0
  51. wbnews/tests/test_models.py +80 -0
  52. wbnews/tests/test_utils.py +7 -0
  53. wbnews/tests/tests.py +12 -0
  54. wbnews/urls.py +25 -0
  55. wbnews/utils.py +57 -0
  56. wbnews/viewsets/__init__.py +12 -0
  57. wbnews/viewsets/buttons.py +42 -0
  58. wbnews/viewsets/display.py +148 -0
  59. wbnews/viewsets/endpoints.py +34 -0
  60. wbnews/viewsets/menu.py +29 -0
  61. wbnews/viewsets/titles.py +44 -0
  62. wbnews/viewsets/views.py +168 -0
  63. wbnews-1.58.3.dist-info/METADATA +7 -0
  64. wbnews-1.58.3.dist-info/RECORD +65 -0
  65. wbnews-1.58.3.dist-info/WHEEL +5 -0
wbnews/models/news.py ADDED
@@ -0,0 +1,131 @@
1
+ from datetime import date
2
+ from typing import Any
3
+
4
+ from celery import chord, shared_task
5
+ from celery.canvas import Signature
6
+ from django.conf.global_settings import LANGUAGES
7
+ from django.contrib.postgres.fields import ArrayField
8
+ from django.db import models
9
+ from django.db.models.signals import post_save
10
+ from django.dispatch import receiver
11
+ from django.utils import timezone
12
+ from django.utils.translation import gettext_lazy as _
13
+ from slugify import slugify
14
+ from wbcore.contrib.ai.llm.decorators import llm
15
+ from wbcore.contrib.io.mixins import ImportMixin
16
+ from wbcore.models import WBModel
17
+
18
+ from wbnews.import_export.handlers.news import NewsImportHandler
19
+ from wbnews.models.llm.cleaned_news import clean_news_config, summarized_news_config
20
+ from wbnews.models.relationships import NewsRelationship
21
+ from wbnews.signals import create_news_relationships
22
+
23
+ from ..utils import detect_near_duplicates
24
+
25
+
26
@shared_task
def create_relationship(chain_results: list[list[dict[str, Any]]], news_id: int):
    """Bulk-insert the relationships collected for one news item.

    ``chain_results`` holds one list per upstream task; every dict in those
    lists is expanded as keyword arguments of a ``NewsRelationship`` bound to
    ``news_id``.
    """
    pending = [
        NewsRelationship(news_id=news_id, **relationship_kwargs)
        for task_result in chain_results
        for relationship_kwargs in task_result
    ]
    # ignore_conflicts: rows clashing on (content_type, object_id, news) are skipped
    NewsRelationship.objects.bulk_create(
        pending,
        ignore_conflicts=True,
        unique_fields=["content_type", "object_id", "news"],
    )
37
+
38
+
39
class DefaultObjectManager(models.Manager):
    """Manager whose queryset leaves out rows flagged ``mark_as_duplicate``."""

    def get_queryset(self):
        base_queryset = super().get_queryset()
        return base_queryset.filter(mark_as_duplicate=False)
42
+
43
+
44
@llm([clean_news_config, summarized_news_config])
class News(ImportMixin, WBModel):
    """A news item attached to a ``NewsSource``.

    ``objects`` hides rows flagged as duplicates; ``all_objects`` returns every row.
    """

    errors = {
        "relationship_signal": "using the create_news_relationships signal must return a celery Signature, sender: {0} did not."
    }
    import_export_handler_class = NewsImportHandler

    datetime = models.DateTimeField(verbose_name=_("Datetime"), default=timezone.now)
    title = models.CharField(max_length=500, verbose_name=_("Title"))
    guid = models.CharField(max_length=1024, unique=True)
    description = models.TextField(blank=True, verbose_name=_("Description"))
    summary = models.TextField(blank=True, verbose_name=_("Summary"))
    language = models.CharField(max_length=16, choices=LANGUAGES, blank=True, null=True, verbose_name=_("Language"))
    link = models.URLField(max_length=1024, blank=True, null=True, verbose_name=_("Link"))
    tags = ArrayField(models.CharField(max_length=16), default=list)
    enclosures = ArrayField(models.URLField(), default=list)
    source = models.ForeignKey(
        "wbnews.NewsSource", on_delete=models.CASCADE, related_name="news", verbose_name=_("Source")
    )
    image_url = models.URLField(blank=True, null=True)
    mark_as_duplicate = models.BooleanField(default=False, verbose_name=_("Mark as duplicate"))

    objects = DefaultObjectManager()
    all_objects = models.Manager()

    def save(self, *args, **kwargs):
        """Clamp ``datetime`` to the current time and fill in a missing ``guid`` before saving."""
        self.datetime = min(self.datetime, timezone.now())  # we ensure a news is never in the future
        # ``guid`` is a non-nullable CharField: when it is not provided its value is
        # the empty string, not None, so every falsy value is treated as "missing".
        if not self.guid:
            self.guid = self.get_default_guid(self.title, self.link)
        super().save(*args, **kwargs)

    def __str__(self) -> str:
        return f"{self.title} ({self.source.title})"

    def update_and_create_news_relationships(self, synchronous: bool = False):
        """
        Fire the ``create_news_relationships`` signal and schedule the returned tasks.

        Every receiver must return a celery ``Signature``. The signatures are run as
        a chord whose callback is ``create_relationship`` for this news.

        :param synchronous: run the chord eagerly (``apply``) instead of ``apply_async``.
        :raises AssertionError: if a receiver returns something other than a ``Signature``.
        """
        tasks = []
        for sender, task_signature in create_news_relationships.send(sender=News, instance=self):
            if not isinstance(task_signature, Signature):
                raise AssertionError(self.errors["relationship_signal"].format(sender))
            tasks.append(task_signature)
        if tasks:
            res = chord(tasks, create_relationship.s(self.id))
            if synchronous:
                res.apply()
            else:
                res.apply_async()

    @classmethod
    def get_default_guid(cls, title: str, link: str | None, max_length: int = 1024) -> str:
        """Return ``link`` when it is set, otherwise the slugified title cut to ``max_length``."""
        if link:
            return link
        return slugify(title)[0:max_length]

    @classmethod
    def get_representation_endpoint(cls) -> str:
        return "wbnews:news-list"

    @classmethod
    def get_representation_value_key(cls) -> str:
        return "id"

    @classmethod
    def get_representation_label_key(cls) -> str:
        return "{{title}} ({{datetime}})"

    @classmethod
    def get_endpoint_basename(cls) -> str:
        return "wbnews:news"

    @classmethod
    def handle_duplicates(cls, start: date, end: date, content_label: str = "description", threshold: float = 0.9):
        """
        Flag near-duplicate news whose ``datetime`` lies between ``start`` and ``end``.

        The ``content_label`` field of each non-duplicate news in the range is handed to
        ``detect_near_duplicates``; the returned ids are marked as duplicates, but only
        those that belong to the same range.
        """
        qs = News.objects.filter(datetime__gte=start, datetime__lte=end)
        data = dict(qs.values_list("id", content_label))
        duplicate_ids = detect_near_duplicates(data, threshold=threshold)
        qs.filter(id__in=duplicate_ids).update(mark_as_duplicate=True)
122
+
123
+
124
@receiver(post_save, sender="wbnews.News")
def post_save_create_news_relationships(sender: type, instance: "News", raw: bool, created: bool, **kwargs):
    """
    After a news is first saved, trigger the creation of its relationships.

    Nothing happens for raw saves or for updates of an existing row.
    """
    if raw or not created:
        return
    instance.update_and_create_news_relationships()
@@ -0,0 +1,45 @@
1
+ from django.contrib.contenttypes.fields import GenericForeignKey
2
+ from django.contrib.contenttypes.models import ContentType
3
+ from django.db import models
4
+ from django.utils.translation import gettext as _
5
+
6
+
7
class NewsRelationship(models.Model):
    """Link between a news item and an arbitrary object (generic foreign key)."""

    class SentimentChoices(models.IntegerChoices):
        POSITIVE = 4, _("Positive")
        SLIGHTLY_POSITIVE = 3, _("Slightly Positive")
        SLIGHTLY_NEGATIVE = 2, _("Slightly Negative")
        NEGATIVE = 1, _("Negative")

        def get_color(self):
            """Return the hex colour mapped to this choice's name."""
            return {
                "POSITIVE": "#96DD99",
                "SLIGHTLY_POSITIVE": "#FFEE8C",
                "SLIGHTLY_NEGATIVE": "#FF964F",
                "NEGATIVE": "#FF6961",
            }[self.name]

    news = models.ForeignKey(to="wbnews.News", related_name="relationships", on_delete=models.CASCADE)
    content_type = models.ForeignKey(ContentType, on_delete=models.CASCADE)
    object_id = models.PositiveIntegerField()
    content_object = GenericForeignKey("content_type", "object_id")
    content_object_repr = models.CharField(max_length=512, default="")

    important = models.BooleanField(null=True, blank=True)
    sentiment = models.PositiveIntegerField(null=True, blank=True, choices=SentimentChoices.choices)
    analysis = models.TextField(null=True, blank=True)

    class Meta:
        verbose_name = "News Relationship"
        indexes = [models.Index(fields=["content_type", "object_id"])]
        constraints = [
            models.UniqueConstraint(name="unique_news_relationship", fields=["content_type", "object_id", "news"])
        ]

    def __str__(self) -> str:
        return "{} -> {}".format(self.news.title, self.content_object)

    def save(self, *args, **kwargs):
        """Store the string form of the linked object, then save."""
        linked_object = self.content_object
        self.content_object_repr = str(linked_object)
        super().save(*args, **kwargs)
@@ -0,0 +1,73 @@
1
+ import re
2
+
3
+ from django.contrib.postgres.fields import ArrayField
4
+ from django.db import models
5
+ from wbcore.models import WBModel
6
+
7
+ from wbnews.models.utils import endpoint_to_author
8
+
9
+
10
class NewsSource(WBModel):
    """A provider of news, identified by an ``identifier`` and/or an ``endpoint``.

    ``endpoint`` is compared to incoming endpoints both literally and as a regular
    expression (see ``source_dict_to_model``).
    """

    class Type(models.TextChoices):
        RSS = "RSS", "RSS"
        EMAIL = "EMAIL", "EMAIL"

    type = models.CharField(default=Type.RSS, choices=Type.choices, max_length=6)
    title = models.CharField(max_length=255)
    identifier = models.CharField(max_length=255, unique=True, blank=True, null=True)
    tags = ArrayField(models.CharField(max_length=16), default=list, blank=True)
    image = models.URLField(blank=True, null=True)
    description = models.TextField(default="", blank=True)
    author = models.CharField(max_length=255, default="")
    clean_content = models.BooleanField(default=False)
    endpoint = models.CharField(max_length=1024, unique=True)
    is_active = models.BooleanField(default=True)

    def __str__(self):
        return f"{self.title}"

    def save(self, *args, **kwargs):
        """Derive ``author`` from ``endpoint`` when no author is set, then save."""
        if not self.author and self.endpoint:
            self.author = endpoint_to_author(self.endpoint)
        super().save(*args, **kwargs)

    @classmethod
    def get_representation_endpoint(cls) -> str:
        return "wbnews:sourcerepresentation-list"

    @classmethod
    def get_representation_value_key(cls) -> str:
        return "id"

    @classmethod
    def get_representation_label_key(cls) -> str:
        return "{{title}}"

    @classmethod
    def get_endpoint_basename(cls) -> str:
        return "wbnews:source"

    @staticmethod
    def _endpoint_matches(pattern: str, endpoint: str) -> bool:
        """Tell whether ``endpoint`` equals ``pattern`` or matches it as a regex."""
        if pattern == endpoint:
            return True
        try:
            return re.search(pattern, endpoint) is not None
        except re.error:
            # A stored endpoint (e.g. a plain URL) is not necessarily a valid
            # regular expression; in that case only the literal comparison applies.
            return False

    @classmethod
    def source_dict_to_model(cls, data: dict):
        """
        Return the source described by ``data``, creating it when none matches.

        Lookup priority: ``id``, then ``identifier``, then ``endpoint`` (literal or
        regex match against the stored endpoints). ``type`` narrows the candidates
        when present. Note that ``"endpoint"`` is popped from ``data``.

        :raises NewsSource.DoesNotExist: if ``id`` is given but matches no source.
        """
        sources = NewsSource.objects.all()
        endpoint = data.pop("endpoint", None)
        if "id" in data:
            return sources.get(id=data["id"])
        if source_type := data.get("type"):
            sources = sources.filter(type=source_type)
        if identifier := data.get("identifier"):
            sources = sources.filter(identifier=identifier)
        elif endpoint:
            for source in sources:
                if cls._endpoint_matches(source.endpoint, endpoint):
                    return source
        if sources.count() == 1:
            return sources.first()
        # Only email addresses get their local part replaced by a wildcard. An
        # endpoint without "@" would otherwise be replaced entirely by ".*" and
        # match every later endpoint.
        if endpoint and "@" in endpoint:
            endpoint = re.sub(r"^[^@]+", ".*", re.escape(endpoint))
        return NewsSource.objects.create(**data, endpoint=endpoint)
wbnews/models/utils.py ADDED
@@ -0,0 +1,15 @@
1
+ from contextlib import suppress
2
+ from urllib.parse import urlparse
3
+
4
+
5
def endpoint_to_author(endpoint: str) -> str:
    """Derive a human-readable author name from a source endpoint.

    For an email-like endpoint (contains ``@``) the domain is used without its
    last label; backslashes (regex escapes) are removed first. Otherwise the
    endpoint is parsed as a URL and the second-to-last label of the host is
    used; if that fails the endpoint itself is used. Underscores become spaces
    and the result is title-cased.
    """
    author = endpoint
    if "@" in endpoint:  # simplest way to check if the endpoint is an email address
        domain_labels = endpoint.replace("\\", "").split("@")[-1].split(".")
        # Drop the last label only when there is more than one; a bare host such
        # as "localhost" is kept whole (and is always joined back into a str).
        if len(domain_labels) > 1:
            domain_labels = domain_labels[:-1]
        author = ".".join(domain_labels)
    else:  # otherwise we consider it a url and extract only the domain part
        with suppress(ValueError, IndexError):
            author = urlparse(endpoint).netloc.split(".")[-2]

    return author.replace("_", " ").title()
wbnews/serializers.py ADDED
@@ -0,0 +1,134 @@
1
+ from django.utils.translation import gettext_lazy as _
2
+ from rest_framework.reverse import reverse
3
+ from wbcore import serializers as wb_serializers
4
+ from wbcore.content_type.serializers import (
5
+ ContentTypeRepresentationSerializer,
6
+ DynamicObjectIDRepresentationSerializer,
7
+ )
8
+
9
+ from .models import News, NewsRelationship, NewsSource
10
+
11
+
12
class SourceRepresentationSerializer(wb_serializers.RepresentationSerializer):
    """Compact representation of a news source: id, title and detail link."""

    _detail = wb_serializers.HyperlinkField(reverse_name="wbnews:source-detail")

    class Meta:
        model = NewsSource
        fields = (
            "id",
            "title",
            "_detail",
        )
18
+
19
+
20
class SourceModelSerializer(wb_serializers.ModelSerializer):
    """Read-only serializer of a news source, with a link to its news list."""

    title = wb_serializers.CharField(read_only=True, label=_("Title"))
    identifier = wb_serializers.CharField(read_only=True, label=_("Identifier"))
    image = wb_serializers.CharField(read_only=True)
    description = wb_serializers.CharField(read_only=True, label=_("Description"))
    author = wb_serializers.CharField(read_only=True, label=_("Author"))
    updated = wb_serializers.DateTimeField(read_only=True, label=_("Updated"))

    @wb_serializers.register_resource()
    def news(self, instance, request, user):
        # URL of the news list nested under this source
        news_list_url = reverse("wbnews:source-news-list", args=[instance.id], request=request)
        return {"news": news_list_url}

    class Meta:
        model = NewsSource
        fields = (
            "id",
            "title",
            "identifier",
            "image",
            "description",
            "author",
            "updated",
            "_additional_resources",
        )
35
+
36
+
37
class NewsRepresentationSerializer(wb_serializers.RepresentationSerializer):
    """Compact representation of a news: id, datetime, title and detail link."""

    _detail = wb_serializers.HyperlinkField(reverse_name="wbnews:news-detail")

    class Meta:
        model = News
        fields = (
            "id",
            "datetime",
            "title",
            "_detail",
        )
43
+
44
+
45
class NewsModelSerializer(wb_serializers.ModelSerializer):
    """Serializer of a news item together with the representation of its source."""

    _source = SourceRepresentationSerializer(source="source")
    image_url = wb_serializers.ImageURLField()

    @wb_serializers.register_resource()
    def open_link(self, instance, request, user):
        # The resource is only exposed when the news actually has a link
        return {"open_link": instance.link} if instance.link else {}

    class Meta:
        model = News
        fields = (
            "id",
            "datetime",
            "title",
            "description",
            "summary",
            "link",
            "language",
            "image_url",
            "source",
            "_source",
            "_additional_resources",
        )
71
+
72
+
73
class NewsRelationshipModelSerializer(wb_serializers.ModelSerializer):
    """Serializer of a ``NewsRelationship``.

    The news-level fields (title, description, summary, datetime, source) are
    declared read-only here; they are not model fields of ``NewsRelationship``,
    so they are presumably supplied by the view's queryset (confirm against the
    viewset).
    """

    source = wb_serializers.PrimaryKeyCharField(read_only=True)
    _source = SourceRepresentationSerializer(source="source")
    title = wb_serializers.TextField(read_only=True, label=_("Title"))
    description = wb_serializers.TextField(read_only=True, label=_("Description"))
    summary = wb_serializers.TextField(read_only=True, label=_("Summary"))
    datetime = wb_serializers.DateTimeField(read_only=True, label=_("Date"))
    _content_type = ContentTypeRepresentationSerializer(source="content_type")
    object_id = wb_serializers.CharField(label="Linked Object", required=False)
    _object_id = DynamicObjectIDRepresentationSerializer(
        content_type_field_name="content_type",
        source="object_id",
        optional_get_parameters={"content_type": "content_type"},
        depends_on=[{"field": "content_type", "options": {}}],
        filter_params={
            "is_security": True
        },  # TODO needs to find a way to not create a dependency to the wbfdm module here
    )
    news = wb_serializers.PrimaryKeyRelatedField(
        queryset=News.objects.all(), read_only=lambda view: not view.new_mode, label=_("News")
    )
    _news = NewsRepresentationSerializer(source="news")

    def validate(self, data):
        """Inject the view's ``object_id`` / ``content_type`` into ``data`` when set.

        The serializer may be used without a view in its context, or with a view
        that does not define these attributes; both cases leave ``data`` untouched.
        """
        if view := self.context.get("view"):
            if object_id := getattr(view, "object_id", None):
                data["object_id"] = object_id
            if content_type := getattr(view, "content_type", None):
                data["content_type"] = content_type
        return super().validate(data)

    class Meta:
        model = NewsRelationship
        read_only_fields = (
            "content_object_repr",
            "datetime",
            "title",
            "description",
            "summary",
            "content_type",
            "_content_type",
        )
        fields = (
            "id",
            "news",
            "_news",
            "content_object_repr",
            "datetime",
            "sentiment",
            "analysis",
            "important",
            "title",
            "description",
            "summary",
            "source",
            "_source",
            "content_type",
            "_content_type",
            "object_id",
            "_object_id",
            "_additional_resources",
        )
wbnews/signals.py ADDED
@@ -0,0 +1,4 @@
1
+ from django.db.models.signals import ModelSignal
2
+
3
+ # this signal gathers the news relationships so that their insertion can be handled efficiently.
4
+ create_news_relationships = ModelSignal(use_caching=False)
wbnews/tasks.py ADDED
@@ -0,0 +1,16 @@
1
+ from datetime import date, timedelta
2
+
3
+ from celery import shared_task
4
+
5
+ from wbnews.models import News
6
+
7
+
8
@shared_task()
def handle_daily_news_duplicates(
    task_date: date | None = None,
    day_interval: int = 7,
):
    """Flag duplicate news in a window of ``day_interval`` days around ``task_date``.

    ``task_date`` falls back to today when it is not provided.
    """
    reference_date = task_date if task_date else date.today()
    half_window = timedelta(days=day_interval)
    News.handle_duplicates(reference_date - half_window, reference_date + half_window)
File without changes
@@ -0,0 +1,6 @@
1
+ from pytest_factoryboy import register
2
+ from wbcore.tests.conftest import *
3
+ from wbnews.factories import NewsFactory, NewsSourceFactory
4
+
5
# Expose the wbnews factories as pytest fixtures (source first, then news).
for _factory_class in (NewsSourceFactory, NewsFactory):
    register(_factory_class)
File without changes
@@ -0,0 +1,25 @@
1
+ from unittest.mock import PropertyMock, patch
2
+
3
+ import pytest
4
+
5
+ from wbnews.import_export.parsers.emails.utils import EmlContentParser
6
+
7
+
8
class TestEmlContentParser:
    """Checks how ``EmlContentParser.source`` is derived from an email."""

    @pytest.fixture
    def content_parser(self):
        # Parser built from empty bytes, with only the "From" header populated
        eml_parser = EmlContentParser(b"")
        eml_parser.message = {"From": "main@acme.com"}
        return eml_parser

    @patch.object(EmlContentParser, "text", new_callable=PropertyMock)
    def test_source_from_in_text(self, mock_text, content_parser):
        mock_text.return_value = (
            "some random email content with a From field From: source name <email@test.com> and the rest of the email"
        )
        expected_source = {"title": "Source Name", "endpoint": "email@test.com", "type": "EMAIL"}
        assert content_parser.source == expected_source

    @patch.object(EmlContentParser, "text", new_callable=PropertyMock)
    def test_source_from_in_text_alt(self, mock_text, content_parser):
        mock_text.return_value = "some random email content without a From field"
        expected_source = {"title": "Acme.Com", "endpoint": "main@acme.com", "type": "EMAIL"}
        assert content_parser.source == expected_source
@@ -0,0 +1,80 @@
1
+ from datetime import timedelta, timezone
2
+ from unittest.mock import patch
3
+
4
+ import pytest
5
+ from django.utils import timezone as django_timezone
6
+ from faker import Faker
7
+
8
+ from wbnews.models import News, NewsSource
9
+
10
+ fake = Faker()
11
+
12
+
13
@pytest.mark.django_db
class TestSource:
    """Tests of the ``NewsSource`` model."""

    @pytest.mark.parametrize("news_source__title", ["source1"])
    def test_str(self, news_source):
        assert str(news_source) == f"{news_source.title}"

    def test_source_dict_to_model(self, news_source_factory):
        first_source = news_source_factory.create()
        second_source = news_source_factory.create()

        # priority to "id"
        by_id = NewsSource.source_dict_to_model({"id": first_source.id, "identifier": second_source.identifier})
        assert by_id == first_source

        # priority to "identifier"
        by_identifier = NewsSource.source_dict_to_model(
            {"endpoint": first_source.endpoint, "identifier": second_source.identifier}
        )
        assert by_identifier == second_source

        # exact match on endpoint
        assert NewsSource.source_dict_to_model({"endpoint": second_source.endpoint}) == second_source

        # regex match on endpoint
        first_source.endpoint = ".*@test.com"
        first_source.save()
        assert NewsSource.source_dict_to_model({"endpoint": "abc@test.com"}) == first_source

        # no match at all: a new source is created
        created_source = NewsSource.source_dict_to_model({"endpoint": "abc@main_source.com", "title": "New Source"})
        assert created_source not in [first_source, second_source]
        assert created_source.endpoint == r".*@main_source\.com"
        assert created_source.title == "New Source"
        assert created_source.author == "Main Source"
38
+
39
+
40
@pytest.mark.django_db
class TestNews:
    """Tests of the ``News`` model."""

    @pytest.mark.parametrize("news__title", ["new1"])
    def test_str(self, news):
        expected = f"{news.title} ({news.source.title})"
        assert str(news) == expected

    def test_mark_as_deplicates_not_in_default_queryset(self, news):
        assert set(News.objects.all()) == {news}

    def test_get_default_guid(self):
        assert News.get_default_guid("This is a title", None) == "this-is-a-title"
        # link takes precedence
        guid_from_link = News.get_default_guid("This is a title", "http://mylink.com")
        assert guid_from_link == "http://mylink.com"
        assert News.get_default_guid("a" * 24, None, max_length=20) == "a" * 20

    def test_future_news(self, news_factory):
        # ensure a future datetime always default to now
        now = django_timezone.now()
        tomorrow = now + timedelta(days=1)
        future_news = news_factory.create(datetime=tomorrow)
        # we do that to account for clock difference
        assert (future_news.datetime - now).seconds < 1

    @patch("wbnews.models.news.detect_near_duplicates")
    def test_handle_duplicates(self, mock_fct, news_factory):
        val_date = fake.date_time(tzinfo=timezone.utc)
        # this news is outside of the date range used for the duplicate search
        out_of_range = news_factory.create(datetime=val_date - timedelta(days=1))
        kept_first = news_factory.create(datetime=val_date)
        kept_second = news_factory.create(datetime=val_date)
        duplicate = news_factory.create(datetime=val_date)

        # out_of_range is reported as duplicate but does not fall within the
        # specified daterange, so it will not be marked
        mock_fct.return_value = [out_of_range.id, duplicate.id]
        News.handle_duplicates(val_date, val_date)

        duplicate.refresh_from_db()
        assert duplicate.mark_as_duplicate is True
        assert set(News.objects.all()) == {out_of_range, kept_first, kept_second}
@@ -0,0 +1,7 @@
1
+ from wbnews.models.utils import endpoint_to_author
2
+
3
+
4
def test_endpoint_to_author():
    """An escaped email, a URL and a bare word each map to the expected author."""
    expected_by_endpoint = {
        "test@test_test\\.com": "Test Test",
        "http://somesubdomain.domain.com": "Domain",
        "test": "Test",
    }
    for endpoint, expected_author in expected_by_endpoint.items():
        assert endpoint_to_author(endpoint) == expected_author
wbnews/tests/tests.py ADDED
@@ -0,0 +1,12 @@
1
+ import pytest
2
+ from wbcore.test import GenerateTest, default_config
3
+
4
# Keep, for every key of the default configuration, only the entries whose
# module belongs to wbnews.
config = {
    key: [entry for entry in value if entry.__module__.startswith("wbnews")]
    for key, value in default_config.items()
}


@pytest.mark.django_db
@GenerateTest(config)
class TestProject:
    pass
wbnews/urls.py ADDED
@@ -0,0 +1,25 @@
1
+ from django.urls import include, path
2
+ from wbcore.routers import WBCoreRouter
3
+
4
+ from wbnews.viewsets import views
5
+
6
# Top-level routes: (prefix, viewset, basename), registered in this order.
router = WBCoreRouter()
for _prefix, _viewset, _basename in (
    (r"newsrepresentation", views.NewsRepresentationViewSet, "newsrepresentation"),
    (r"newssourcerepresentation", views.SourceRepresentationViewSet, "sourcerepresentation"),
    (r"news", views.NewsModelViewSet, "news"),
    (r"newssource", views.SourceModelViewSet, "source"),
    (r"newsrelationship", views.NewsRelationshipModelViewSet, "newsrelationship"),
):
    router.register(_prefix, _viewset, basename=_basename)


# Routes nested under a given source.
source_router = WBCoreRouter()
source_router.register(r"news", views.NewsSourceModelViewSet, basename="source-news")

# News list restricted to one content object.
_news_content_object_view = views.NewsModelViewSet.as_view({"get": "list"})

urlpatterns = [
    path("", include(router.urls)),
    path("source/<int:source_id>/", include(source_router.urls)),
    path(
        "contentnews/<int:content_type>/<int:content_id>/",
        _news_content_object_view,
        name="news_content_object",
    ),
]
wbnews/utils.py ADDED
@@ -0,0 +1,57 @@
1
+ import logging
2
+
3
+ import numpy as np
4
+ import pandas as pd
5
+ from django.utils.html import strip_tags
6
+ from sklearn.feature_extraction.text import TfidfVectorizer
7
+ from sklearn.metrics.pairwise import cosine_similarity
8
+
9
+ logger = logging.getLogger("news")
10
+
11
+
12
def _get_similarity_matrix_df(data: dict[int, str]) -> pd.DataFrame:
    """Return the pairwise cosine similarity of the texts as a DataFrame.

    The texts are turned into TF-IDF vectors; rows and columns of the result
    are both labelled with the keys of ``data``.
    """
    ids, texts = zip(*data.items(), strict=False)
    tfidf_matrix = TfidfVectorizer().fit_transform(texts)
    return pd.DataFrame(cosine_similarity(tfidf_matrix), index=ids, columns=ids)
21
+
22
+
23
def detect_near_duplicates(data: dict[int, str], threshold: float = 0.9) -> list[int]:
    """
    Detects near-duplicate articles based on TF-IDF & Cosine Similarity.

    HTML tags are stripped from every text before the comparison.

    Parameters:
    - data (dict[int, str]): dictionary of news id with their respective content
    - threshold (float): Similarity threshold (default = 0.9).

    Returns:
    - List of duplicated ids. For each pair whose similarity is strictly above
      the threshold, only the id that comes later in ``data`` is returned.
    """
    if len(data) < 2:
        return []
    logger.info("Processing %s news", len(data))
    # Cleanup step
    clean_data = {_id: strip_tags(text) for _id, text in data.items()}

    # get similarity matrix (from the cleaned texts, not the raw ones)
    df = _get_similarity_matrix_df(clean_data)

    # Replace the lower matrix triangle with NaN
    df = df.where(np.triu(np.ones(df.shape)).astype(bool))
    # melt the symmetrical matrix into a key value store
    df = df.stack().reset_index(name="value")
    # remove duplicate pair with same id (expected to be 1.0)
    df = df[df["level_0"] != df["level_1"]]
    # get duplicates candidates
    df = df[df["value"] > threshold]
    # return only one side of the duplicate pair
    duplicate_ids = df["level_1"].unique().tolist()
    logger.info("%s duplicated news found", len(duplicate_ids))

    return duplicate_ids
@@ -0,0 +1,12 @@
1
+ from .buttons import NewsButtonConfig
2
+ from .display import NewsDisplayConfig, NewsSourceDisplayConfig, SourceDisplayConfig
3
+ from .endpoints import NewsEndpointConfig, NewsSourceEndpointConfig, NewsRelationshipEndpointConfig
4
+ from .menu import NEWS_MENUITEM, NEWSSOURCE_MENUITEM
5
+ from .titles import NewsSourceModelTitleConfig, NewsTitleConfig, SourceModelTitleConfig
6
+ from .views import (
7
+ NewsModelViewSet,
8
+ NewsRepresentationViewSet,
9
+ SourceModelViewSet,
10
+ SourceRepresentationViewSet,
11
+ NewsRelationshipModelViewSet,
12
+ )