wbnews 1.45.0__py2.py3-none-any.whl → 1.58.1__py2.py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (44) hide show
  1. wbnews/admin.py +4 -1
  2. wbnews/factories.py +7 -5
  3. wbnews/filters/__init__.py +1 -1
  4. wbnews/filters/news.py +39 -2
  5. wbnews/import_export/backends/news.py +3 -3
  6. wbnews/import_export/handlers/news.py +35 -3
  7. wbnews/import_export/parsers/emails/news.py +0 -9
  8. wbnews/import_export/parsers/emails/utils.py +16 -12
  9. wbnews/import_export/parsers/rss/news.py +3 -9
  10. wbnews/locale/de/LC_MESSAGES/django.mo +0 -0
  11. wbnews/locale/de/LC_MESSAGES/django.po +93 -39
  12. wbnews/locale/de/LC_MESSAGES/django.po.translated +173 -0
  13. wbnews/locale/en/LC_MESSAGES/django.mo +0 -0
  14. wbnews/locale/en/LC_MESSAGES/django.po +159 -0
  15. wbnews/locale/fr/LC_MESSAGES/django.mo +0 -0
  16. wbnews/locale/fr/LC_MESSAGES/django.po +162 -0
  17. wbnews/migrations/0011_newsrelationship_content_object_repr.py +18 -0
  18. wbnews/migrations/0012_alter_news_unique_together_news_identifier_and_more.py +91 -0
  19. wbnews/migrations/0013_alter_news_datetime.py +19 -0
  20. wbnews/migrations/0014_newsrelationship_unique_news_relationship.py +27 -0
  21. wbnews/models/llm/cleaned_news.py +26 -23
  22. wbnews/models/news.py +35 -21
  23. wbnews/models/relationships.py +25 -1
  24. wbnews/models/sources.py +35 -5
  25. wbnews/models/utils.py +15 -0
  26. wbnews/serializers.py +51 -5
  27. wbnews/tasks.py +16 -0
  28. wbnews/tests/parsers/__init__.py +0 -0
  29. wbnews/tests/parsers/test_emails.py +25 -0
  30. wbnews/tests/test_models.py +65 -0
  31. wbnews/tests/test_utils.py +7 -0
  32. wbnews/urls.py +0 -5
  33. wbnews/utils.py +57 -0
  34. wbnews/viewsets/__init__.py +1 -1
  35. wbnews/viewsets/buttons.py +21 -2
  36. wbnews/viewsets/display.py +34 -21
  37. wbnews/viewsets/endpoints.py +22 -6
  38. wbnews/viewsets/menu.py +6 -0
  39. wbnews/viewsets/titles.py +5 -1
  40. wbnews/viewsets/views.py +48 -23
  41. {wbnews-1.45.0.dist-info → wbnews-1.58.1.dist-info}/METADATA +1 -2
  42. wbnews-1.58.1.dist-info/RECORD +65 -0
  43. wbnews-1.45.0.dist-info/RECORD +0 -49
  44. {wbnews-1.45.0.dist-info → wbnews-1.58.1.dist-info}/WHEEL +0 -0
@@ -0,0 +1,159 @@
1
+ # SOME DESCRIPTIVE TITLE.
2
+ # Copyright (C) YEAR THE PACKAGE'S COPYRIGHT HOLDER
3
+ # This file is distributed under the same license as the PACKAGE package.
4
+ # FIRST AUTHOR <EMAIL@ADDRESS>, YEAR.
5
+ #
6
+ msgid ""
7
+ msgstr ""
8
+ "Project-Id-Version: PACKAGE VERSION\n"
9
+ "Report-Msgid-Bugs-To: \n"
10
+ "POT-Creation-Date: 2025-05-30 11:37+0200\n"
11
+ "PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n"
12
+ "Last-Translator: FULL NAME <EMAIL@ADDRESS>\n"
13
+ "Language-Team: LANGUAGE <LL@li.org>\n"
14
+ "Language: \n"
15
+ "MIME-Version: 1.0\n"
16
+ "Content-Type: text/plain; charset=UTF-8\n"
17
+ "Content-Transfer-Encoding: 8bit\n"
18
+
19
+ #: models/news.py:51 viewsets/display.py:54 viewsets/display.py:100
20
+ #: viewsets/display.py:141
21
+ msgid "Datetime"
22
+ msgstr ""
23
+
24
+ #: models/news.py:52 serializers.py:21 serializers.py:76 viewsets/display.py:20
25
+ #: viewsets/display.py:55 viewsets/display.py:102 viewsets/display.py:142
26
+ msgid "Title"
27
+ msgstr ""
28
+
29
+ #: models/news.py:54 serializers.py:24 serializers.py:77 viewsets/display.py:23
30
+ #: viewsets/display.py:57 viewsets/display.py:104 viewsets/display.py:143
31
+ msgid "Description"
32
+ msgstr ""
33
+
34
+ #: models/news.py:55 serializers.py:78 viewsets/display.py:56
35
+ #: viewsets/display.py:103
36
+ msgid "Summary"
37
+ msgstr ""
38
+
39
+ #: models/news.py:56 viewsets/display.py:60 viewsets/display.py:144
40
+ msgid "Language"
41
+ msgstr ""
42
+
43
+ #: models/news.py:57
44
+ msgid "Link"
45
+ msgstr ""
46
+
47
+ #: models/news.py:61 viewsets/display.py:59 viewsets/display.py:106
48
+ msgid "Source"
49
+ msgstr ""
50
+
51
+ #: models/news.py:64
52
+ msgid "Mark as duplicate"
53
+ msgstr ""
54
+
55
+ #: models/relationships.py:9
56
+ msgid "Positive"
57
+ msgstr ""
58
+
59
+ #: models/relationships.py:10
60
+ msgid "Slightly Positive"
61
+ msgstr ""
62
+
63
+ #: models/relationships.py:11
64
+ msgid "Slightly Negative"
65
+ msgstr ""
66
+
67
+ #: models/relationships.py:12
68
+ msgid "Negative"
69
+ msgstr ""
70
+
71
+ #: serializers.py:22
72
+ msgid "Identifier"
73
+ msgstr ""
74
+
75
+ #: serializers.py:25 viewsets/display.py:22
76
+ msgid "Author"
77
+ msgstr ""
78
+
79
+ #: serializers.py:26
80
+ msgid "Updated"
81
+ msgstr ""
82
+
83
+ #: serializers.py:79
84
+ msgid "Date"
85
+ msgstr ""
86
+
87
+ #: serializers.py:92 viewsets/menu.py:6 viewsets/titles.py:24
88
+ msgid "News"
89
+ msgstr ""
90
+
91
+ #: viewsets/buttons.py:14
92
+ msgid "Open News"
93
+ msgstr ""
94
+
95
+ #: viewsets/buttons.py:21 viewsets/buttons.py:22 viewsets/buttons.py:24
96
+ msgid "Reset relationships"
97
+ msgstr ""
98
+
99
+ #: viewsets/display.py:21
100
+ msgid "RSS feed"
101
+ msgstr ""
102
+
103
+ #: viewsets/display.py:24
104
+ msgid "Last Update"
105
+ msgstr ""
106
+
107
+ #: viewsets/display.py:61
108
+ msgid "Image"
109
+ msgstr ""
110
+
111
+ #: viewsets/display.py:93
112
+ msgid "Linked Object"
113
+ msgstr ""
114
+
115
+ #: viewsets/display.py:101
116
+ msgid "Analysis"
117
+ msgstr ""
118
+
119
+ #: viewsets/display.py:105
120
+ msgid "Important"
121
+ msgstr ""
122
+
123
+ #: viewsets/menu.py:11
124
+ msgid "News Relationships"
125
+ msgstr ""
126
+
127
+ #: viewsets/menu.py:17 viewsets/titles.py:9
128
+ msgid "Sources"
129
+ msgstr ""
130
+
131
+ #: viewsets/menu.py:23
132
+ msgid "Create Source"
133
+ msgstr ""
134
+
135
+ #: viewsets/titles.py:13
136
+ #, python-brace-format
137
+ msgid "Source: {source}"
138
+ msgstr ""
139
+
140
+ #: viewsets/titles.py:14
141
+ msgid "News Source"
142
+ msgstr ""
143
+
144
+ #: viewsets/titles.py:19
145
+ msgid "News Flow"
146
+ msgstr ""
147
+
148
+ #: viewsets/titles.py:30
149
+ #, python-brace-format
150
+ msgid "News from {source}"
151
+ msgstr ""
152
+
153
+ #: viewsets/titles.py:36
154
+ msgid "News Article for {}"
155
+ msgstr ""
156
+
157
+ #: viewsets/titles.py:37 viewsets/titles.py:44
158
+ msgid "News Article"
159
+ msgstr ""
Binary file
@@ -0,0 +1,162 @@
1
+ # SOME DESCRIPTIVE TITLE.
2
+ # Copyright (C) YEAR THE PACKAGE'S COPYRIGHT HOLDER
3
+ # This file is distributed under the same license as the PACKAGE package.
4
+ # FIRST AUTHOR <EMAIL@ADDRESS>, YEAR.
5
+ #
6
+ #, fuzzy
7
+ msgid ""
8
+ msgstr ""
9
+ "Project-Id-Version: PACKAGE VERSION\n"
10
+ "Report-Msgid-Bugs-To: \n"
11
+ "POT-Creation-Date: 2025-05-30 11:37+0200\n"
12
+ "PO-Revision-Date: 2025-05-30 09:40+0000\n"
13
+ "Language-Team: French (https://app.transifex.com/stainly/teams/171242/fr/)\n"
14
+ "MIME-Version: 1.0\n"
15
+ "Content-Type: text/plain; charset=UTF-8\n"
16
+ "Content-Transfer-Encoding: 8bit\n"
17
+ "Language: fr\n"
18
+ "Plural-Forms: nplurals=3; plural=(n == 0 || n == 1) ? 0 : n != 0 && n % 1000000 == 0 ? 1 : 2;\n"
19
+
20
+ #: models/news.py:51 viewsets/display.py:54 viewsets/display.py:100
21
+ #: viewsets/display.py:141
22
+ msgid "Datetime"
23
+ msgstr ""
24
+
25
+ #: models/news.py:52 serializers.py:21 serializers.py:76
26
+ #: viewsets/display.py:20 viewsets/display.py:55 viewsets/display.py:102
27
+ #: viewsets/display.py:142
28
+ msgid "Title"
29
+ msgstr ""
30
+
31
+ #: models/news.py:54 serializers.py:24 serializers.py:77
32
+ #: viewsets/display.py:23 viewsets/display.py:57 viewsets/display.py:104
33
+ #: viewsets/display.py:143
34
+ msgid "Description"
35
+ msgstr ""
36
+
37
+ #: models/news.py:55 serializers.py:78 viewsets/display.py:56
38
+ #: viewsets/display.py:103
39
+ msgid "Summary"
40
+ msgstr ""
41
+
42
+ #: models/news.py:56 viewsets/display.py:60 viewsets/display.py:144
43
+ msgid "Language"
44
+ msgstr ""
45
+
46
+ #: models/news.py:57
47
+ msgid "Link"
48
+ msgstr ""
49
+
50
+ #: models/news.py:61 viewsets/display.py:59 viewsets/display.py:106
51
+ msgid "Source"
52
+ msgstr ""
53
+
54
+ #: models/news.py:64
55
+ msgid "Mark as duplicate"
56
+ msgstr ""
57
+
58
+ #: models/relationships.py:9
59
+ msgid "Positive"
60
+ msgstr ""
61
+
62
+ #: models/relationships.py:10
63
+ msgid "Slightly Positive"
64
+ msgstr ""
65
+
66
+ #: models/relationships.py:11
67
+ msgid "Slightly Negative"
68
+ msgstr ""
69
+
70
+ #: models/relationships.py:12
71
+ msgid "Negative"
72
+ msgstr ""
73
+
74
+ #: serializers.py:22
75
+ msgid "Identifier"
76
+ msgstr ""
77
+
78
+ #: serializers.py:25 viewsets/display.py:22
79
+ msgid "Author"
80
+ msgstr ""
81
+
82
+ #: serializers.py:26
83
+ msgid "Updated"
84
+ msgstr ""
85
+
86
+ #: serializers.py:79
87
+ msgid "Date"
88
+ msgstr ""
89
+
90
+ #: serializers.py:92 viewsets/menu.py:6 viewsets/titles.py:24
91
+ msgid "News"
92
+ msgstr ""
93
+
94
+ #: viewsets/buttons.py:14
95
+ msgid "Open News"
96
+ msgstr ""
97
+
98
+ #: viewsets/buttons.py:21 viewsets/buttons.py:22 viewsets/buttons.py:24
99
+ msgid "Reset relationships"
100
+ msgstr ""
101
+
102
+ #: viewsets/display.py:21
103
+ msgid "RSS feed"
104
+ msgstr ""
105
+
106
+ #: viewsets/display.py:24
107
+ msgid "Last Update"
108
+ msgstr ""
109
+
110
+ #: viewsets/display.py:61
111
+ msgid "Image"
112
+ msgstr ""
113
+
114
+ #: viewsets/display.py:93
115
+ msgid "Linked Object"
116
+ msgstr ""
117
+
118
+ #: viewsets/display.py:101
119
+ msgid "Analysis"
120
+ msgstr ""
121
+
122
+ #: viewsets/display.py:105
123
+ msgid "Important"
124
+ msgstr ""
125
+
126
+ #: viewsets/menu.py:11
127
+ msgid "News Relationships"
128
+ msgstr ""
129
+
130
+ #: viewsets/menu.py:17 viewsets/titles.py:9
131
+ msgid "Sources"
132
+ msgstr ""
133
+
134
+ #: viewsets/menu.py:23
135
+ msgid "Create Source"
136
+ msgstr ""
137
+
138
+ #: viewsets/titles.py:13
139
+ #, python-brace-format
140
+ msgid "Source: {source}"
141
+ msgstr ""
142
+
143
+ #: viewsets/titles.py:14
144
+ msgid "News Source"
145
+ msgstr ""
146
+
147
+ #: viewsets/titles.py:19
148
+ msgid "News Flow"
149
+ msgstr ""
150
+
151
+ #: viewsets/titles.py:30
152
+ #, python-brace-format
153
+ msgid "News from {source}"
154
+ msgstr ""
155
+
156
+ #: viewsets/titles.py:36
157
+ msgid "News Article for {}"
158
+ msgstr ""
159
+
160
+ #: viewsets/titles.py:37 viewsets/titles.py:44
161
+ msgid "News Article"
162
+ msgstr ""
@@ -0,0 +1,18 @@
1
+ # Generated by Django 5.0.12 on 2025-02-13 10:08
2
+
3
+ from django.db import migrations, models
4
+
5
+
6
class Migration(migrations.Migration):
    """Add the ``content_object_repr`` column to ``NewsRelationship``."""

    dependencies = [("wbnews", "0010_newsrelationship_important")]

    operations = [
        migrations.AddField(
            model_name="newsrelationship",
            name="content_object_repr",
            field=models.CharField(max_length=512, default=""),
        ),
    ]
@@ -0,0 +1,91 @@
1
+ # Generated by Django 5.0.12 on 2025-02-24 13:45
2
+ from collections import defaultdict
3
+
4
+ from django.core.exceptions import ValidationError
5
+ from django.core.validators import URLValidator
6
+ from django.db import migrations, models
7
+ from tqdm import tqdm
8
+
9
+
10
def migrate_identifier(apps, schema_editor):
    """Backfill ``News.guid`` and blank out links that are not valid URLs.

    Steps:
      1. Compute a guid for every news row; only the first row seen for each
         guid value is written back.
      2. Delete the rows whose guid is still NULL afterwards, i.e. the rows
         whose computed guid collided with an earlier row.
      3. Set ``link`` to NULL on every remaining row whose link does not pass
         ``URLValidator``.

    Args:
        apps: Historical app registry supplied by ``RunPython``.
        schema_editor: Schema editor supplied by ``RunPython`` (unused).
    """
    # Historical models do not carry custom classmethods, so the guid rule is
    # taken from the real model class.
    from wbnews.models.news import News as NewsClass

    News = apps.get_model("wbnews", "News")

    seen_guids = set()
    guid_updates = []
    qs = News.objects.all()
    for news in tqdm(qs, total=qs.count()):
        news.guid = NewsClass.get_default_guid(news.title, news.link)
        if news.guid not in seen_guids:
            seen_guids.add(news.guid)
            guid_updates.append(news)
    print(len(guid_updates))
    News.objects.bulk_update(guid_updates, ["guid"], batch_size=10000)

    leftovers = News.objects.filter(guid__isnull=True)
    print(leftovers.count())
    leftovers.delete()

    # A dedicated list is used for the link pass. Reusing the guid list would
    # also submit the first-pass instances, which still hold the original
    # link; when both copies of a row end up in the same batch the stale value
    # can take precedence and the invalid link would survive.
    validate_url = URLValidator()
    link_updates = []
    qs = News.objects.all()
    for news in tqdm(qs, total=qs.count()):
        try:
            validate_url(news.link)
        except ValidationError:
            news.link = None
            link_updates.append(news)

    News.objects.bulk_update(link_updates, ["link"], batch_size=10000)
37
+
38
class Migration(migrations.Migration):
    """Introduce ``News.guid`` as the unique key and rename ``NewsSource.url``.

    The previous ``unique_together`` on ``News`` is dropped, ``guid`` is added
    as nullable, filled in by ``migrate_identifier`` and then made unique and
    non-null. The same migration also turns ``link`` into a URL field, adds
    ``mark_as_duplicate``, updates the ``sentiment`` choices and renames
    ``NewsSource.url`` to ``endpoint``.
    """

    dependencies = [
        ("io", "0008_importsource_resource_kwargs"),
        ("wbnews", "0011_newsrelationship_content_object_repr"),
    ]

    operations = [
        migrations.AlterUniqueTogether(name="news", unique_together=set()),
        migrations.AddField(
            model_name="news",
            name="guid",
            field=models.CharField(max_length=1024, null=True, blank=True, default=None),
            preserve_default=False,
        ),
        # NOTE(review): presumably these statements flush deferred constraint
        # checks around the data migration so the later ALTERs can run in the
        # same transaction - confirm against the target database.
        migrations.RunSQL(sql="SET CONSTRAINTS ALL IMMEDIATE;"),
        migrations.RunPython(migrate_identifier),
        migrations.RunSQL(sql="SET CONSTRAINTS ALL DEFERRED;"),
        migrations.AlterField(
            model_name="news",
            name="guid",
            field=models.CharField(unique=True, max_length=1024),
        ),
        migrations.AlterField(
            model_name="news",
            name="link",
            field=models.URLField(verbose_name="Link", max_length=1024, null=True, blank=True),
        ),
        migrations.AddField(
            model_name="news",
            name="mark_as_duplicate",
            field=models.BooleanField(verbose_name="Mark as duplicate", default=False),
        ),
        migrations.AlterField(
            model_name="newsrelationship",
            name="sentiment",
            field=models.PositiveIntegerField(
                null=True,
                blank=True,
                choices=[
                    (4, "Positive"),
                    (3, "Slightly Positive"),
                    (2, "Slightly Negative"),
                    (1, "Negative"),
                ],
            ),
        ),
        migrations.RenameField(model_name="newssource", old_name="url", new_name="endpoint"),
        migrations.AlterField(
            model_name="newssource",
            name="endpoint",
            field=models.CharField(unique=True, max_length=1024),
        ),
    ]
@@ -0,0 +1,19 @@
1
+ # Generated by Django 5.0.12 on 2025-04-03 08:08
2
+
3
+ import django.utils.timezone
4
+ from django.db import migrations, models
5
+
6
+
7
class Migration(migrations.Migration):
    """Give ``News.datetime`` a default of ``django.utils.timezone.now``."""

    dependencies = [("wbnews", "0012_alter_news_unique_together_news_identifier_and_more")]

    operations = [
        migrations.AlterField(
            model_name="news",
            name="datetime",
            field=models.DateTimeField(verbose_name="Datetime", default=django.utils.timezone.now),
        ),
    ]
@@ -0,0 +1,27 @@
1
+ # Generated by Django 5.0.14 on 2025-05-06 14:44
2
+
3
+ from django.db import migrations, models
4
+ from django.db.models import Count
5
+ from tqdm import tqdm
6
+
7
def handle_duplicated_relationship(apps, schema_editor):
    """Delete surplus ``NewsRelationship`` rows sharing the same target and news.

    Rows are grouped by ``(content_type, object_id, news)``. For every group
    with more than one row, the row with the lowest primary key is kept and
    the others are deleted.

    Args:
        apps: Historical app registry supplied by ``RunPython``.
        schema_editor: Schema editor supplied by ``RunPython`` (unused).
    """
    NewsRelationship = apps.get_model("wbnews", "NewsRelationship")
    duplicated_keys = (
        NewsRelationship.objects.values("content_type", "object_id", "news")
        .annotate(c=Count("*"))
        .filter(c__gt=1)
    )
    for key in tqdm(duplicated_keys, total=duplicated_keys.count()):
        group = NewsRelationship.objects.filter(
            content_type=key["content_type"],
            news=key["news"],
            object_id=key["object_id"],
        )
        # Explicit ordering makes the surviving row deterministic; slicing an
        # unordered queryset leaves the choice to the database.
        surplus_pks = list(group.order_by("pk").values_list("pk", flat=True)[1:])
        # A sliced queryset cannot be deleted directly, hence the pk lookup.
        NewsRelationship.objects.filter(pk__in=surplus_pks).delete()
13
+
14
class Migration(migrations.Migration):
    """Remove duplicated news relationships, then add a unique constraint."""

    dependencies = [
        ("contenttypes", "0002_remove_content_type_name"),
        ("wbnews", "0013_alter_news_datetime"),
    ]

    operations = [
        # The cleanup runs first, ahead of the constraint creation below.
        migrations.RunPython(handle_duplicated_relationship),
        migrations.AddConstraint(
            model_name="newsrelationship",
            constraint=models.UniqueConstraint(
                name="unique_news_relationship",
                fields=("content_type", "object_id", "news"),
            ),
        ),
    ]
@@ -5,27 +5,7 @@ from pydantic import BaseModel, Field
5
5
  from wbcore.contrib.ai.llm.config import LLMConfig
6
6
 
7
7
  if TYPE_CHECKING:
8
- from wbnews.models import News
9
-
10
-
11
- def clean_news_prompt(news: "News"):
12
- return [
13
- SystemMessage(
14
- content="I have an HTML email title and body, and I want to extract only the meaningful content in plain text format, removing all metadata, subscription links, and non-essential parts. The output should be HTML- and Markdown-free and should exclude any text related to links, subscription information, or common phrases like 'Unsubscribe' or 'View online'. Only retain the main email content but do not remove any information related to news."
15
- ),
16
- HumanMessage(
17
- content=f"Title: {news.title}\n\nDescription: {news.description}",
18
- ),
19
- ]
20
-
21
-
22
- def summarize_news_prompt(news: "News"):
23
- return [
24
- SystemMessage(content="Given this news description, please extract a short summary"),
25
- HumanMessage(
26
- content=f"Description: {news.description}",
27
- ),
28
- ]
8
+ from wbnews.models import News # noqa
29
9
 
30
10
 
31
11
  class CleanNewsModel(BaseModel):
@@ -46,18 +26,41 @@ class SummarizedNewsModel(BaseModel):
46
26
  )
47
27
 
48
28
 
29
def get_clean_news_config_query(instance):
    """Return the ``description`` and ``title`` of ``instance`` as a dict.

    Passed as ``query=`` to ``clean_news_config``; the keys match the
    ``{title}`` and ``{description}`` placeholders of its prompt.
    """
    return dict(description=instance.description, title=instance.title)
31
+
32
+
49
33
  clean_news_config = LLMConfig["News"](
50
34
  key="clean",
51
35
  output_model=CleanNewsModel,
52
- prompt=clean_news_prompt,
36
+ prompt=[
37
+ SystemMessage(
38
+ content="I have an HTML email title and body, and I want to extract only the meaningful content in plain text format, removing all metadata, subscription links, and non-essential parts. The output should be HTML- and Markdown-free and should exclude any text related to links, subscription information, or common phrases like 'Unsubscribe' or 'View online'. Only retain the main email content but do not remove any information related to news."
39
+ ),
40
+ HumanMessage(
41
+ content="Title: {title}\n\nDescription: {description}",
42
+ ),
43
+ ],
53
44
  on_save=True,
54
45
  on_condition=lambda n: n.source.clean_content,
46
+ query=get_clean_news_config_query,
55
47
  )
56
48
 
49
+
50
def get_summarized_news_config_query(instance):
    """Return the ``description`` of ``instance`` wrapped in a dict.

    Passed as ``query=`` to ``summarized_news_config``; the key matches the
    ``{description}`` placeholder of its prompt.
    """
    return dict(description=instance.description)
52
+
53
+
57
54
  summarized_news_config = LLMConfig["News"](
58
55
  key="summarize",
59
56
  output_model=SummarizedNewsModel,
60
- prompt=summarize_news_prompt,
57
+ prompt=[
58
+ SystemMessage(content="Given this news description, please extract a short summary"),
59
+ HumanMessage(
60
+ content="Description: {description}",
61
+ ),
62
+ ],
61
63
  on_save=True,
62
64
  on_condition=lambda n: not n.summary,
65
+ query=get_summarized_news_config_query,
63
66
  )
wbnews/models/news.py CHANGED
@@ -1,3 +1,4 @@
1
+ from datetime import date
1
2
  from typing import Any
2
3
 
3
4
  from celery import chord, shared_task
@@ -7,7 +8,9 @@ from django.contrib.postgres.fields import ArrayField
7
8
  from django.db import models
8
9
  from django.db.models.signals import post_save
9
10
  from django.dispatch import receiver
11
+ from django.utils import timezone
10
12
  from django.utils.translation import gettext_lazy as _
13
+ from slugify import slugify
11
14
  from wbcore.contrib.ai.llm.decorators import llm
12
15
  from wbcore.contrib.io.mixins import ImportMixin
13
16
  from wbcore.models import WBModel
@@ -15,9 +18,10 @@ from wbcore.models import WBModel
15
18
  from wbnews.import_export.handlers.news import NewsImportHandler
16
19
  from wbnews.models.llm.cleaned_news import clean_news_config, summarized_news_config
17
20
  from wbnews.models.relationships import NewsRelationship
18
- from wbnews.models.sources import NewsSource
19
21
  from wbnews.signals import create_news_relationships
20
22
 
23
+ from ..utils import detect_near_duplicates
24
+
21
25
 
22
26
  @shared_task
23
27
  def create_relationship(chain_results: list[list[dict[str, Any]]], news_id: int):
@@ -32,6 +36,11 @@ def create_relationship(chain_results: list[list[dict[str, Any]]], news_id: int)
32
36
  )
33
37
 
34
38
 
39
class DefaultObjectManager(models.Manager):
    """Manager whose querysets leave out rows with ``mark_as_duplicate`` set."""

    def get_queryset(self):
        """Return the base queryset restricted to ``mark_as_duplicate=False``."""
        base_queryset = super().get_queryset()
        return base_queryset.filter(mark_as_duplicate=False)
42
+
43
+
35
44
  @llm([clean_news_config, summarized_news_config])
36
45
  class News(ImportMixin, WBModel):
37
46
  errors = {
@@ -39,21 +48,29 @@ class News(ImportMixin, WBModel):
39
48
  }
40
49
  import_export_handler_class = NewsImportHandler
41
50
 
42
- datetime = models.DateTimeField(verbose_name=_("Datetime"))
51
+ datetime = models.DateTimeField(verbose_name=_("Datetime"), default=timezone.now)
43
52
  title = models.CharField(max_length=500, verbose_name=_("Title"))
53
+ guid = models.CharField(max_length=1024, unique=True)
44
54
  description = models.TextField(blank=True, verbose_name=_("Description"))
45
55
  summary = models.TextField(blank=True, verbose_name=_("Summary"))
46
56
  language = models.CharField(max_length=16, choices=LANGUAGES, blank=True, null=True, verbose_name=_("Language"))
47
- link = models.CharField(max_length=500, blank=True, null=True, verbose_name=_("Link"))
57
+ link = models.URLField(max_length=1024, blank=True, null=True, verbose_name=_("Link"))
48
58
  tags = ArrayField(models.CharField(max_length=16), default=list)
49
59
  enclosures = ArrayField(models.URLField(), default=list)
50
60
  source = models.ForeignKey(
51
61
  "wbnews.NewsSource", on_delete=models.CASCADE, related_name="news", verbose_name=_("Source")
52
62
  )
53
63
  image_url = models.URLField(blank=True, null=True)
64
+ mark_as_duplicate = models.BooleanField(default=False, verbose_name=_("Mark as duplicate"))
54
65
 
55
- class Meta:
56
- unique_together = ["title", "source", "datetime"]
66
+ objects = DefaultObjectManager()
67
+ all_objects = models.Manager()
68
+
69
def save(self, *args, **kwargs):
    """Normalise ``datetime`` and ``guid`` before delegating to the parent save.

    - ``datetime`` is capped at the current time so a news item is never
      dated in the future; a missing value is replaced by the current time.
    - ``guid`` is derived from the title and link whenever it is unset.
    """
    now = timezone.now()
    if self.datetime is None or self.datetime > now:
        self.datetime = now
    # A non-null CharField without a default yields "" (not None) on a fresh
    # instance, so a truthiness check is needed to detect an unset guid.
    if not self.guid:
        self.guid = self.get_default_guid(self.title, self.link)
    super().save(*args, **kwargs)
57
74
 
58
75
  def __str__(self) -> str:
59
76
  return f"{self.title} ({self.source.title})"
@@ -64,7 +81,8 @@ class News(ImportMixin, WBModel):
64
81
  """
65
82
  tasks = []
66
83
  for sender, task_signature in create_news_relationships.send(sender=News, instance=self):
67
- assert isinstance(task_signature, Signature), self.errors["relationship_signal"].format(sender)
84
+ if not isinstance(task_signature, Signature):
85
+ raise AssertionError(self.errors["relationship_signal"].format(sender))
68
86
  tasks.append(task_signature)
69
87
  if tasks:
70
88
  res = chord(tasks, create_relationship.s(self.id))
@@ -73,22 +91,11 @@ class News(ImportMixin, WBModel):
73
91
  else:
74
92
  res.apply_async()
75
93
 
76
- # TODO: Consider moving this into a get_or_create queryset method on NewsSource?
77
94
  @classmethod
78
- def source_dict_to_model(cls, data: dict) -> NewsSource:
79
- sources = NewsSource.objects
80
- if "id" in data:
81
- return sources.get(id=data["id"])
82
- if identifier := data.get("identifier"):
83
- sources = sources.filter(identifier=identifier)
84
- elif url := data.get("url"):
85
- sources = sources.filter(url=url)
86
- elif title := data.get("title"):
87
- sources = sources.filter(title=title)
88
- if sources.count() == 1:
89
- return sources.first()
90
- else:
91
- return NewsSource.objects.create(**data)
95
def get_default_guid(cls, title: str, link: str | None, max_length: int = 1024) -> str:
    """Return the default guid for a news item.

    The link is used as-is when it is non-empty; otherwise the guid is the
    slugified title cut to ``max_length`` characters.
    """
    if not link:
        return slugify(title)[:max_length]
    return link
92
99
 
93
100
  @classmethod
94
101
  def get_representation_endpoint(cls) -> str:
@@ -106,6 +113,13 @@ class News(ImportMixin, WBModel):
106
113
  def get_endpoint_basename(cls) -> str:
107
114
  return "wbnews:news"
108
115
 
116
@classmethod
def handle_duplicates(cls, start: date, end: date, content_label: str = "description", threshold: float = 0.9):
    """Flag near-duplicate news within a datetime range.

    Args:
        start: Lower bound (inclusive) applied to ``datetime``.
        end: Upper bound (inclusive) applied to ``datetime``.
        content_label: Name of the field whose value is compared.
        threshold: Value forwarded to ``detect_near_duplicates``.

    The ids returned by ``detect_near_duplicates`` get
    ``mark_as_duplicate=True``.
    """
    qs = cls.objects.filter(datetime__gte=start, datetime__lte=end)
    data = dict(qs.values_list("id", content_label))
    if not data:
        # Nothing to compare in this range; avoid handing an empty mapping
        # to the detection helper.
        return
    duplicate_ids = detect_near_duplicates(data, threshold=threshold)
    if duplicate_ids:
        qs.filter(id__in=duplicate_ids).update(mark_as_duplicate=True)
122
+
109
123
 
110
124
  @receiver(post_save, sender="wbnews.News")
111
125
  def post_save_create_news_relationships(sender: type, instance: "News", raw: bool, created: bool, **kwargs):