wbnews 1.46.12__py2.py3-none-any.whl → 1.60.1__py2.py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (38) hide show
  1. wbnews/admin.py +4 -1
  2. wbnews/factories.py +7 -5
  3. wbnews/filters/__init__.py +1 -1
  4. wbnews/filters/news.py +39 -2
  5. wbnews/import_export/backends/news.py +3 -3
  6. wbnews/import_export/handlers/news.py +35 -3
  7. wbnews/import_export/parsers/emails/news.py +2 -11
  8. wbnews/import_export/parsers/emails/utils.py +16 -12
  9. wbnews/import_export/parsers/rss/news.py +3 -9
  10. wbnews/locale/de/LC_MESSAGES/django.mo +0 -0
  11. wbnews/locale/de/LC_MESSAGES/django.po +92 -39
  12. wbnews/locale/de/LC_MESSAGES/django.po.translated +173 -0
  13. wbnews/locale/en/LC_MESSAGES/django.mo +0 -0
  14. wbnews/locale/en/LC_MESSAGES/django.po +159 -0
  15. wbnews/locale/fr/LC_MESSAGES/django.mo +0 -0
  16. wbnews/locale/fr/LC_MESSAGES/django.po +161 -0
  17. wbnews/migrations/0012_alter_news_unique_together_news_identifier_and_more.py +91 -0
  18. wbnews/migrations/0013_alter_news_datetime.py +19 -0
  19. wbnews/migrations/0014_newsrelationship_unique_news_relationship.py +27 -0
  20. wbnews/models/llm/cleaned_news.py +26 -23
  21. wbnews/models/news.py +37 -22
  22. wbnews/models/relationships.py +20 -1
  23. wbnews/models/sources.py +35 -5
  24. wbnews/models/utils.py +15 -0
  25. wbnews/serializers.py +16 -7
  26. wbnews/tasks.py +17 -0
  27. wbnews/tests/parsers/__init__.py +0 -0
  28. wbnews/tests/parsers/test_emails.py +25 -0
  29. wbnews/tests/test_models.py +65 -0
  30. wbnews/tests/test_utils.py +7 -0
  31. wbnews/utils.py +57 -0
  32. wbnews/viewsets/display.py +25 -29
  33. wbnews/viewsets/endpoints.py +11 -6
  34. wbnews/viewsets/views.py +5 -4
  35. {wbnews-1.46.12.dist-info → wbnews-1.60.1.dist-info}/METADATA +1 -2
  36. wbnews-1.60.1.dist-info/RECORD +65 -0
  37. {wbnews-1.46.12.dist-info → wbnews-1.60.1.dist-info}/WHEEL +1 -1
  38. wbnews-1.46.12.dist-info/RECORD +0 -50
@@ -0,0 +1,159 @@
1
+ # SOME DESCRIPTIVE TITLE.
2
+ # Copyright (C) YEAR THE PACKAGE'S COPYRIGHT HOLDER
3
+ # This file is distributed under the same license as the PACKAGE package.
4
+ # FIRST AUTHOR <EMAIL@ADDRESS>, YEAR.
5
+ #
6
+ msgid ""
7
+ msgstr ""
8
+ "Project-Id-Version: PACKAGE VERSION\n"
9
+ "Report-Msgid-Bugs-To: \n"
10
+ "POT-Creation-Date: 2026-01-16 14:04+0100\n"
11
+ "PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n"
12
+ "Last-Translator: FULL NAME <EMAIL@ADDRESS>\n"
13
+ "Language-Team: LANGUAGE <LL@li.org>\n"
14
+ "Language: \n"
15
+ "MIME-Version: 1.0\n"
16
+ "Content-Type: text/plain; charset=UTF-8\n"
17
+ "Content-Transfer-Encoding: 8bit\n"
18
+
19
+ #: models/news.py:52 viewsets/display.py:54 viewsets/display.py:100
20
+ #: viewsets/display.py:141
21
+ msgid "Datetime"
22
+ msgstr ""
23
+
24
+ #: models/news.py:53 serializers.py:21 serializers.py:76 viewsets/display.py:20
25
+ #: viewsets/display.py:55 viewsets/display.py:102 viewsets/display.py:142
26
+ msgid "Title"
27
+ msgstr ""
28
+
29
+ #: models/news.py:55 serializers.py:24 serializers.py:77 viewsets/display.py:23
30
+ #: viewsets/display.py:57 viewsets/display.py:104 viewsets/display.py:143
31
+ msgid "Description"
32
+ msgstr ""
33
+
34
+ #: models/news.py:56 serializers.py:78 viewsets/display.py:56
35
+ #: viewsets/display.py:103
36
+ msgid "Summary"
37
+ msgstr ""
38
+
39
+ #: models/news.py:57 viewsets/display.py:60 viewsets/display.py:144
40
+ msgid "Language"
41
+ msgstr ""
42
+
43
+ #: models/news.py:58
44
+ msgid "Link"
45
+ msgstr ""
46
+
47
+ #: models/news.py:62 viewsets/display.py:59 viewsets/display.py:106
48
+ msgid "Source"
49
+ msgstr ""
50
+
51
+ #: models/news.py:65
52
+ msgid "Mark as duplicate"
53
+ msgstr ""
54
+
55
+ #: models/relationships.py:9
56
+ msgid "Positive"
57
+ msgstr ""
58
+
59
+ #: models/relationships.py:10
60
+ msgid "Slightly Positive"
61
+ msgstr ""
62
+
63
+ #: models/relationships.py:11
64
+ msgid "Slightly Negative"
65
+ msgstr ""
66
+
67
+ #: models/relationships.py:12
68
+ msgid "Negative"
69
+ msgstr ""
70
+
71
+ #: serializers.py:22
72
+ msgid "Identifier"
73
+ msgstr ""
74
+
75
+ #: serializers.py:25 viewsets/display.py:22
76
+ msgid "Author"
77
+ msgstr ""
78
+
79
+ #: serializers.py:26
80
+ msgid "Updated"
81
+ msgstr ""
82
+
83
+ #: serializers.py:79
84
+ msgid "Date"
85
+ msgstr ""
86
+
87
+ #: serializers.py:92 viewsets/menu.py:6 viewsets/titles.py:24
88
+ msgid "News"
89
+ msgstr ""
90
+
91
+ #: viewsets/buttons.py:14
92
+ msgid "Open News"
93
+ msgstr ""
94
+
95
+ #: viewsets/buttons.py:21 viewsets/buttons.py:22 viewsets/buttons.py:24
96
+ msgid "Reset relationships"
97
+ msgstr ""
98
+
99
+ #: viewsets/display.py:21
100
+ msgid "RSS feed"
101
+ msgstr ""
102
+
103
+ #: viewsets/display.py:24
104
+ msgid "Last Update"
105
+ msgstr ""
106
+
107
+ #: viewsets/display.py:61
108
+ msgid "Image"
109
+ msgstr ""
110
+
111
+ #: viewsets/display.py:93
112
+ msgid "Linked Object"
113
+ msgstr ""
114
+
115
+ #: viewsets/display.py:101
116
+ msgid "Analysis"
117
+ msgstr ""
118
+
119
+ #: viewsets/display.py:105
120
+ msgid "Important"
121
+ msgstr ""
122
+
123
+ #: viewsets/menu.py:11
124
+ msgid "News Relationships"
125
+ msgstr ""
126
+
127
+ #: viewsets/menu.py:17 viewsets/titles.py:9
128
+ msgid "Sources"
129
+ msgstr ""
130
+
131
+ #: viewsets/menu.py:23
132
+ msgid "Create Source"
133
+ msgstr ""
134
+
135
+ #: viewsets/titles.py:13
136
+ #, python-brace-format
137
+ msgid "Source: {source}"
138
+ msgstr ""
139
+
140
+ #: viewsets/titles.py:14
141
+ msgid "News Source"
142
+ msgstr ""
143
+
144
+ #: viewsets/titles.py:19
145
+ msgid "News Flow"
146
+ msgstr ""
147
+
148
+ #: viewsets/titles.py:30
149
+ #, python-brace-format
150
+ msgid "News from {source}"
151
+ msgstr ""
152
+
153
+ #: viewsets/titles.py:36
154
+ msgid "News Article for {}"
155
+ msgstr ""
156
+
157
+ #: viewsets/titles.py:37 viewsets/titles.py:44
158
+ msgid "News Article"
159
+ msgstr ""
Binary file
@@ -0,0 +1,161 @@
1
+ # SOME DESCRIPTIVE TITLE.
2
+ # Copyright (C) YEAR THE PACKAGE'S COPYRIGHT HOLDER
3
+ # This file is distributed under the same license as the PACKAGE package.
4
+ # FIRST AUTHOR <EMAIL@ADDRESS>, YEAR.
5
+ #
6
+ msgid ""
7
+ msgstr ""
8
+ "Project-Id-Version: PACKAGE VERSION\n"
9
+ "Report-Msgid-Bugs-To: \n"
10
+ "POT-Creation-Date: 2026-01-16 14:04+0100\n"
11
+ "PO-Revision-Date: 2025-05-30 09:40+0000\n"
12
+ "Language-Team: French (https://app.transifex.com/stainly/teams/171242/fr/)\n"
13
+ "MIME-Version: 1.0\n"
14
+ "Content-Type: text/plain; charset=UTF-8\n"
15
+ "Content-Transfer-Encoding: 8bit\n"
16
+ "Language: fr\n"
17
+ "Plural-Forms: nplurals=3; plural=(n == 0 || n == 1) ? 0 : n != 0 && n % 1000000 == 0 ? 1 : 2;\n"
18
+
19
+ #: models/news.py:52 viewsets/display.py:54 viewsets/display.py:100
20
+ #: viewsets/display.py:141
21
+ msgid "Datetime"
22
+ msgstr ""
23
+
24
+ #: models/news.py:53 serializers.py:21 serializers.py:76
25
+ #: viewsets/display.py:20 viewsets/display.py:55 viewsets/display.py:102
26
+ #: viewsets/display.py:142
27
+ msgid "Title"
28
+ msgstr ""
29
+
30
+ #: models/news.py:55 serializers.py:24 serializers.py:77
31
+ #: viewsets/display.py:23 viewsets/display.py:57 viewsets/display.py:104
32
+ #: viewsets/display.py:143
33
+ msgid "Description"
34
+ msgstr ""
35
+
36
+ #: models/news.py:56 serializers.py:78 viewsets/display.py:56
37
+ #: viewsets/display.py:103
38
+ msgid "Summary"
39
+ msgstr ""
40
+
41
+ #: models/news.py:57 viewsets/display.py:60 viewsets/display.py:144
42
+ msgid "Language"
43
+ msgstr ""
44
+
45
+ #: models/news.py:58
46
+ msgid "Link"
47
+ msgstr ""
48
+
49
+ #: models/news.py:62 viewsets/display.py:59 viewsets/display.py:106
50
+ msgid "Source"
51
+ msgstr ""
52
+
53
+ #: models/news.py:65
54
+ msgid "Mark as duplicate"
55
+ msgstr ""
56
+
57
+ #: models/relationships.py:9
58
+ msgid "Positive"
59
+ msgstr ""
60
+
61
+ #: models/relationships.py:10
62
+ msgid "Slightly Positive"
63
+ msgstr ""
64
+
65
+ #: models/relationships.py:11
66
+ msgid "Slightly Negative"
67
+ msgstr ""
68
+
69
+ #: models/relationships.py:12
70
+ msgid "Negative"
71
+ msgstr ""
72
+
73
+ #: serializers.py:22
74
+ msgid "Identifier"
75
+ msgstr ""
76
+
77
+ #: serializers.py:25 viewsets/display.py:22
78
+ msgid "Author"
79
+ msgstr ""
80
+
81
+ #: serializers.py:26
82
+ msgid "Updated"
83
+ msgstr ""
84
+
85
+ #: serializers.py:79
86
+ msgid "Date"
87
+ msgstr ""
88
+
89
+ #: serializers.py:92 viewsets/menu.py:6 viewsets/titles.py:24
90
+ msgid "News"
91
+ msgstr ""
92
+
93
+ #: viewsets/buttons.py:14
94
+ msgid "Open News"
95
+ msgstr ""
96
+
97
+ #: viewsets/buttons.py:21 viewsets/buttons.py:22 viewsets/buttons.py:24
98
+ msgid "Reset relationships"
99
+ msgstr ""
100
+
101
+ #: viewsets/display.py:21
102
+ msgid "RSS feed"
103
+ msgstr ""
104
+
105
+ #: viewsets/display.py:24
106
+ msgid "Last Update"
107
+ msgstr ""
108
+
109
+ #: viewsets/display.py:61
110
+ msgid "Image"
111
+ msgstr ""
112
+
113
+ #: viewsets/display.py:93
114
+ msgid "Linked Object"
115
+ msgstr ""
116
+
117
+ #: viewsets/display.py:101
118
+ msgid "Analysis"
119
+ msgstr ""
120
+
121
+ #: viewsets/display.py:105
122
+ msgid "Important"
123
+ msgstr ""
124
+
125
+ #: viewsets/menu.py:11
126
+ msgid "News Relationships"
127
+ msgstr ""
128
+
129
+ #: viewsets/menu.py:17 viewsets/titles.py:9
130
+ msgid "Sources"
131
+ msgstr ""
132
+
133
+ #: viewsets/menu.py:23
134
+ msgid "Create Source"
135
+ msgstr ""
136
+
137
+ #: viewsets/titles.py:13
138
+ #, python-brace-format
139
+ msgid "Source: {source}"
140
+ msgstr ""
141
+
142
+ #: viewsets/titles.py:14
143
+ msgid "News Source"
144
+ msgstr ""
145
+
146
+ #: viewsets/titles.py:19
147
+ msgid "News Flow"
148
+ msgstr ""
149
+
150
+ #: viewsets/titles.py:30
151
+ #, python-brace-format
152
+ msgid "News from {source}"
153
+ msgstr ""
154
+
155
+ #: viewsets/titles.py:36
156
+ msgid "News Article for {}"
157
+ msgstr ""
158
+
159
+ #: viewsets/titles.py:37 viewsets/titles.py:44
160
+ msgid "News Article"
161
+ msgstr ""
@@ -0,0 +1,91 @@
1
+ # Generated by Django 5.0.12 on 2025-02-24 13:45
2
+ from collections import defaultdict
3
+
4
+ from django.core.exceptions import ValidationError
5
+ from django.core.validators import URLValidator
6
+ from django.db import migrations, models
7
+ from tqdm import tqdm
8
+
9
+
10
def migrate_identifier(apps, schema_editor):
    """Backfill ``News.guid``, delete GUID collisions and null out invalid links.

    The migration runs in two passes over the historical ``News`` model:

    1. Every row gets the GUID computed by ``News.get_default_guid`` (the link
       when there is one, otherwise the slugified title). Only the first row
       seen for a given GUID is persisted; the others keep a NULL ``guid`` and
       are deleted so that the column can become unique afterwards.
    2. Every remaining row whose ``link`` does not pass ``URLValidator`` gets
       its link set to NULL, so the column can be turned into a ``URLField``.

    Args:
        apps: Historical app registry supplied by ``RunPython``.
        schema_editor: Schema editor supplied by ``RunPython`` (unused).
    """
    # NOTE(review): this imports the live model, so the migration depends on
    # the current implementation of ``get_default_guid`` rather than on a
    # frozen copy of it.
    from wbnews.models.news import News as NewsClass

    News = apps.get_model('wbnews', 'News')

    # Pass 1: compute GUIDs and keep only the first row for each of them.
    seen_guids = set()
    guid_updates = []
    qs = News.objects.all()
    for news in tqdm(qs, total=qs.count()):
        news.guid = NewsClass.get_default_guid(news.title, news.link)
        if news.guid not in seen_guids:
            seen_guids.add(news.guid)
            guid_updates.append(news)
    print(len(guid_updates))
    News.objects.bulk_update(guid_updates, ['guid'], batch_size=10000)

    # Rows that were not persisted above still have a NULL guid: they collide
    # with an earlier row and are dropped.
    leftovers = News.objects.filter(guid__isnull=True)
    print(leftovers.count())
    leftovers.delete()

    # Pass 2: clear links that are not valid URLs. A dedicated list is used
    # here; reusing the list of pass 1 would rewrite every row and put rows
    # with an invalid link twice in the same bulk update (stale value first).
    validate_url = URLValidator()
    link_updates = []
    qs = News.objects.all()
    for news in tqdm(qs, total=qs.count()):
        if news.link is None:
            # Already NULL: nothing to validate and nothing to write.
            continue
        try:
            validate_url(news.link)
        except ValidationError:
            news.link = None
            link_updates.append(news)

    News.objects.bulk_update(link_updates, ['link'], batch_size=10000)
37
+
38
class Migration(migrations.Migration):
    """Introduce ``News.guid`` as the unique key and rename ``NewsSource.url``.

    The operations clear ``unique_together`` on ``News``, add a nullable
    ``guid`` column, backfill it with ``migrate_identifier`` and only then make
    it unique and non-null. ``News.link`` becomes a ``URLField``,
    ``mark_as_duplicate`` is added, the sentiment choices are updated and
    ``NewsSource.url`` is renamed to ``endpoint``.

    The data and raw SQL steps declare no-op reverses so that the migration
    can be unapplied; the backfilled data itself is not restored on reverse.
    """

    dependencies = [
        ('io', '0008_importsource_resource_kwargs'),
        ('wbnews', '0011_newsrelationship_content_object_repr'),
    ]

    operations = [
        migrations.AlterUniqueTogether(
            name='news',
            unique_together=set(),
        ),
        migrations.AddField(
            model_name='news',
            name='guid',
            field=models.CharField(default=None, blank=True, null=True, max_length=1024),
            preserve_default=False,
        ),
        # NOTE(review): presumably the constraints are made immediate so that
        # the data migration does not leave pending constraint checks before
        # the following schema changes - confirm against the target database.
        migrations.RunSQL(sql="SET CONSTRAINTS ALL IMMEDIATE;", reverse_sql=migrations.RunSQL.noop),
        migrations.RunPython(migrate_identifier, reverse_code=migrations.RunPython.noop),
        migrations.RunSQL(sql="SET CONSTRAINTS ALL DEFERRED;", reverse_sql=migrations.RunSQL.noop),
        migrations.AlterField(
            model_name='news',
            name='guid',
            field=models.CharField(max_length=1024, unique=True),
        ),
        migrations.AlterField(
            model_name='news',
            name='link',
            field=models.URLField(max_length=1024, blank=True, null=True, verbose_name='Link'),
        ),
        migrations.AddField(
            model_name='news',
            name='mark_as_duplicate',
            field=models.BooleanField(default=False, verbose_name='Mark as duplicate'),
        ),
        migrations.AlterField(
            model_name='newsrelationship',
            name='sentiment',
            field=models.PositiveIntegerField(blank=True, choices=[(4, 'Positive'), (3, 'Slightly Positive'),
                                                                   (2, 'Slightly Negative'), (1, 'Negative')],
                                              null=True),
        ),
        migrations.RenameField(
            model_name='newssource',
            old_name='url',
            new_name='endpoint',
        ),
        migrations.AlterField(
            model_name='newssource',
            name='endpoint',
            field=models.CharField(max_length=1024, unique=True),
        ),
    ]
@@ -0,0 +1,19 @@
1
+ # Generated by Django 5.0.12 on 2025-04-03 08:08
2
+
3
+ import django.utils.timezone
4
+ from django.db import migrations, models
5
+
6
+
7
class Migration(migrations.Migration):
    # Gives ``News.datetime`` a default of ``django.utils.timezone.now``.

    dependencies = [
        ('wbnews', '0012_alter_news_unique_together_news_identifier_and_more'),
    ]

    operations = [
        migrations.AlterField(
            model_name='news',
            name='datetime',
            # The default is the callable itself, not its result.
            field=models.DateTimeField(default=django.utils.timezone.now, verbose_name='Datetime'),
        ),
    ]
@@ -0,0 +1,27 @@
1
+ # Generated by Django 5.0.14 on 2025-05-06 14:44
2
+
3
+ from django.db import migrations, models
4
+ from django.db.models import Count
5
+ from tqdm import tqdm
6
+
7
def handle_duplicated_relationship(apps, schema_editor):
    """Delete surplus ``NewsRelationship`` rows sharing the same target and news.

    Rows are grouped by ``(content_type, object_id, news)``. For every group
    with more than one row, the row with the lowest primary key is kept and
    the others are deleted, so that a unique constraint on these three fields
    can be added afterwards.

    Args:
        apps: Historical app registry supplied by ``RunPython``.
        schema_editor: Schema editor supplied by ``RunPython`` (unused).
    """
    NewsRelationship = apps.get_model("wbnews", "NewsRelationship")
    duplicated_groups = (
        NewsRelationship.objects.values('content_type', 'object_id', 'news')
        .annotate(c=Count('*'))
        .filter(c__gt=1)
    )
    for row in tqdm(duplicated_groups, total=duplicated_groups.count()):
        group = NewsRelationship.objects.filter(
            content_type=row['content_type'], news=row['news'], object_id=row["object_id"]
        )
        # Explicit ordering makes the kept row deterministic: slicing an
        # unordered queryset gives no guarantee about which row comes first.
        surplus_pks = list(group.order_by('pk').values_list('pk', flat=True)[1:])
        # A sliced queryset cannot be deleted directly, hence the pk lookup.
        NewsRelationship.objects.filter(pk__in=surplus_pks).delete()
13
+
14
class Migration(migrations.Migration):
    """Enforce one ``NewsRelationship`` per ``(content_type, object_id, news)``.

    Existing duplicates are removed by ``handle_duplicated_relationship``
    before the unique constraint is added. The data step declares a no-op
    reverse so that the migration can be unapplied; deleted rows are not
    restored on reverse.
    """

    dependencies = [
        ('contenttypes', '0002_remove_content_type_name'),
        ('wbnews', '0013_alter_news_datetime'),
    ]

    operations = [
        migrations.RunPython(handle_duplicated_relationship, reverse_code=migrations.RunPython.noop),
        migrations.AddConstraint(
            model_name='newsrelationship',
            constraint=models.UniqueConstraint(fields=('content_type', 'object_id', 'news'), name='unique_news_relationship'),
        ),
    ]
@@ -5,27 +5,7 @@ from pydantic import BaseModel, Field
5
5
  from wbcore.contrib.ai.llm.config import LLMConfig
6
6
 
7
7
  if TYPE_CHECKING:
8
- from wbnews.models import News
9
-
10
-
11
- def clean_news_prompt(news: "News"):
12
- return [
13
- SystemMessage(
14
- content="I have an HTML email title and body, and I want to extract only the meaningful content in plain text format, removing all metadata, subscription links, and non-essential parts. The output should be HTML- and Markdown-free and should exclude any text related to links, subscription information, or common phrases like 'Unsubscribe' or 'View online'. Only retain the main email content but do not remove any information related to news."
15
- ),
16
- HumanMessage(
17
- content=f"Title: {news.title}\n\nDescription: {news.description}",
18
- ),
19
- ]
20
-
21
-
22
- def summarize_news_prompt(news: "News"):
23
- return [
24
- SystemMessage(content="Given this news description, please extract a short summary"),
25
- HumanMessage(
26
- content=f"Description: {news.description}",
27
- ),
28
- ]
8
+ from wbnews.models import News # noqa
29
9
 
30
10
 
31
11
  class CleanNewsModel(BaseModel):
@@ -46,18 +26,41 @@ class SummarizedNewsModel(BaseModel):
46
26
  )
47
27
 
48
28
 
29
def get_clean_news_config_query(instance):
    """Return the ``description`` and ``title`` of *instance* as a dict.

    The keys match the ``{title}`` and ``{description}`` placeholders of the
    clean-news prompt; presumably the LLM configuration uses this mapping to
    fill them in.
    """
    field_names = ("description", "title")
    return {name: getattr(instance, name) for name in field_names}
31
+
32
+
49
33
  clean_news_config = LLMConfig["News"](
50
34
  key="clean",
51
35
  output_model=CleanNewsModel,
52
- prompt=clean_news_prompt,
36
+ prompt=[
37
+ SystemMessage(
38
+ content="I have an HTML email title and body, and I want to extract only the meaningful content in plain text format, removing all metadata, subscription links, and non-essential parts. The output should be HTML- and Markdown-free and should exclude any text related to links, subscription information, or common phrases like 'Unsubscribe' or 'View online'. Only retain the main email content but do not remove any information related to news."
39
+ ),
40
+ HumanMessage(
41
+ content="Title: {title}\n\nDescription: {description}",
42
+ ),
43
+ ],
53
44
  on_save=True,
54
45
  on_condition=lambda n: n.source.clean_content,
46
+ query=get_clean_news_config_query,
55
47
  )
56
48
 
49
+
50
def get_summarized_news_config_query(instance):
    """Return the ``description`` of *instance* wrapped in a dict.

    The key matches the ``{description}`` placeholder of the summary prompt;
    presumably the LLM configuration uses this mapping to fill it in.
    """
    query = dict(description=instance.description)
    return query
52
+
53
+
57
54
  summarized_news_config = LLMConfig["News"](
58
55
  key="summarize",
59
56
  output_model=SummarizedNewsModel,
60
- prompt=summarize_news_prompt,
57
+ prompt=[
58
+ SystemMessage(content="Given this news description, please extract a short summary"),
59
+ HumanMessage(
60
+ content="Description: {description}",
61
+ ),
62
+ ],
61
63
  on_save=True,
62
64
  on_condition=lambda n: not n.summary,
65
+ query=get_summarized_news_config_query,
63
66
  )
wbnews/models/news.py CHANGED
@@ -1,3 +1,4 @@
1
+ from datetime import date
1
2
  from typing import Any
2
3
 
3
4
  from celery import chord, shared_task
@@ -7,19 +8,23 @@ from django.contrib.postgres.fields import ArrayField
7
8
  from django.db import models
8
9
  from django.db.models.signals import post_save
9
10
  from django.dispatch import receiver
11
+ from django.utils import timezone
10
12
  from django.utils.translation import gettext_lazy as _
13
+ from slugify import slugify
11
14
  from wbcore.contrib.ai.llm.decorators import llm
12
15
  from wbcore.contrib.io.mixins import ImportMixin
13
16
  from wbcore.models import WBModel
17
+ from wbcore.workers import Queue
14
18
 
15
19
  from wbnews.import_export.handlers.news import NewsImportHandler
16
20
  from wbnews.models.llm.cleaned_news import clean_news_config, summarized_news_config
17
21
  from wbnews.models.relationships import NewsRelationship
18
- from wbnews.models.sources import NewsSource
19
22
  from wbnews.signals import create_news_relationships
20
23
 
24
+ from ..utils import detect_near_duplicates
21
25
 
22
- @shared_task
26
+
27
+ @shared_task(queue=Queue.DEFAULT.value)
23
28
  def create_relationship(chain_results: list[list[dict[str, Any]]], news_id: int):
24
29
  objs = []
25
30
  for relationships in chain_results:
@@ -32,6 +37,11 @@ def create_relationship(chain_results: list[list[dict[str, Any]]], news_id: int)
32
37
  )
33
38
 
34
39
 
40
class DefaultObjectManager(models.Manager):
    """Manager whose querysets leave out rows flagged ``mark_as_duplicate``."""

    def get_queryset(self):
        """Return the base queryset restricted to ``mark_as_duplicate=False``."""
        base_queryset = super().get_queryset()
        return base_queryset.filter(mark_as_duplicate=False)
43
+
44
+
35
45
  @llm([clean_news_config, summarized_news_config])
36
46
  class News(ImportMixin, WBModel):
37
47
  errors = {
@@ -39,21 +49,29 @@ class News(ImportMixin, WBModel):
39
49
  }
40
50
  import_export_handler_class = NewsImportHandler
41
51
 
42
- datetime = models.DateTimeField(verbose_name=_("Datetime"))
52
+ datetime = models.DateTimeField(verbose_name=_("Datetime"), default=timezone.now)
43
53
  title = models.CharField(max_length=500, verbose_name=_("Title"))
54
+ guid = models.CharField(max_length=1024, unique=True)
44
55
  description = models.TextField(blank=True, verbose_name=_("Description"))
45
56
  summary = models.TextField(blank=True, verbose_name=_("Summary"))
46
57
  language = models.CharField(max_length=16, choices=LANGUAGES, blank=True, null=True, verbose_name=_("Language"))
47
- link = models.CharField(max_length=500, blank=True, null=True, verbose_name=_("Link"))
58
+ link = models.URLField(max_length=1024, blank=True, null=True, verbose_name=_("Link"))
48
59
  tags = ArrayField(models.CharField(max_length=16), default=list)
49
60
  enclosures = ArrayField(models.URLField(), default=list)
50
61
  source = models.ForeignKey(
51
62
  "wbnews.NewsSource", on_delete=models.CASCADE, related_name="news", verbose_name=_("Source")
52
63
  )
53
64
  image_url = models.URLField(blank=True, null=True)
65
+ mark_as_duplicate = models.BooleanField(default=False, verbose_name=_("Mark as duplicate"))
54
66
 
55
- class Meta:
56
- unique_together = ["title", "source", "datetime"]
67
+ objects = DefaultObjectManager()
68
+ all_objects = models.Manager()
69
+
70
def save(self, *args, **kwargs):
    """Clamp ``datetime`` to the present and ensure a ``guid`` before saving.

    Args:
        *args: Forwarded unchanged to the parent ``save``.
        **kwargs: Forwarded unchanged to the parent ``save``.
    """
    self.datetime = min(self.datetime, timezone.now())  # we ensure a news is never in the future
    # ``guid`` is a non-null CharField without a default, so an instance built
    # without one carries an empty string rather than None. Both mean "no guid
    # yet" and must trigger the fallback, otherwise "" is saved as the guid.
    if not self.guid:
        self.guid = self.get_default_guid(self.title, self.link)
    super().save(*args, **kwargs)
57
75
 
58
76
  def __str__(self) -> str:
59
77
  return f"{self.title} ({self.source.title})"
@@ -64,7 +82,8 @@ class News(ImportMixin, WBModel):
64
82
  """
65
83
  tasks = []
66
84
  for sender, task_signature in create_news_relationships.send(sender=News, instance=self):
67
- assert isinstance(task_signature, Signature), self.errors["relationship_signal"].format(sender)
85
+ if not isinstance(task_signature, Signature):
86
+ raise AssertionError(self.errors["relationship_signal"].format(sender))
68
87
  tasks.append(task_signature)
69
88
  if tasks:
70
89
  res = chord(tasks, create_relationship.s(self.id))
@@ -73,22 +92,11 @@ class News(ImportMixin, WBModel):
73
92
  else:
74
93
  res.apply_async()
75
94
 
76
- # TODO: Consider moving this into a get_or_create queryset method on NewsSource?
77
95
  @classmethod
78
- def source_dict_to_model(cls, data: dict) -> NewsSource:
79
- sources = NewsSource.objects
80
- if "id" in data:
81
- return sources.get(id=data["id"])
82
- if identifier := data.get("identifier"):
83
- sources = sources.filter(identifier=identifier)
84
- elif url := data.get("url"):
85
- sources = sources.filter(url=url)
86
- elif title := data.get("title"):
87
- sources = sources.filter(title=title)
88
- if sources.count() == 1:
89
- return sources.first()
90
- else:
91
- return NewsSource.objects.create(**data)
96
+ def get_default_guid(cls, title: str, link: str | None, max_length: int = 1024) -> str:
97
+ if link:
98
+ return link
99
+ return slugify(title)[0:max_length]
92
100
 
93
101
  @classmethod
94
102
  def get_representation_endpoint(cls) -> str:
@@ -106,6 +114,13 @@ class News(ImportMixin, WBModel):
106
114
  def get_endpoint_basename(cls) -> str:
107
115
  return "wbnews:news"
108
116
 
117
@classmethod
def handle_duplicates(cls, start: date, end: date, content_label: str = "description", threshold: float = 0.9):
    """Flag near-duplicate news dated between ``start`` and ``end`` (inclusive).

    The ``content_label`` field of every news item in the window is collected
    by id and handed to ``detect_near_duplicates`` together with ``threshold``.
    The ids it returns (presumably the redundant items; verify against the
    helper) are then flagged with ``mark_as_duplicate=True``.
    """
    in_window = News.objects.filter(datetime__gte=start, datetime__lte=end)
    content_by_id = dict(in_window.values_list("id", content_label))
    flagged_ids = detect_near_duplicates(content_by_id, threshold=threshold)
    in_window.filter(id__in=flagged_ids).update(mark_as_duplicate=True)
123
+
109
124
 
110
125
  @receiver(post_save, sender="wbnews.News")
111
126
  def post_save_create_news_relationships(sender: type, instance: "News", raw: bool, created: bool, **kwargs):