wbnews 2.2.1__py2.py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- wbnews/.coveragerc +23 -0
- wbnews/__init__.py +1 -0
- wbnews/admin.py +27 -0
- wbnews/apps.py +9 -0
- wbnews/factories.py +33 -0
- wbnews/import_export/__init__.py +0 -0
- wbnews/import_export/backends/__init__.py +1 -0
- wbnews/import_export/backends/news.py +35 -0
- wbnews/import_export/handlers/__init__.py +1 -0
- wbnews/import_export/handlers/news.py +25 -0
- wbnews/import_export/parsers/__init__.py +0 -0
- wbnews/import_export/parsers/emails/__init__.py +0 -0
- wbnews/import_export/parsers/emails/news.py +48 -0
- wbnews/import_export/parsers/emails/utils.py +61 -0
- wbnews/import_export/parsers/rss/__init__.py +0 -0
- wbnews/import_export/parsers/rss/news.py +63 -0
- wbnews/migrations/0001_initial_squashed_0005_alter_news_import_source.py +349 -0
- wbnews/migrations/0006_alter_news_language.py +122 -0
- wbnews/migrations/0007_auto_20240103_0955.py +43 -0
- wbnews/migrations/0008_alter_news_language.py +123 -0
- wbnews/migrations/0009_newsrelationship_analysis_newsrelationship_sentiment.py +94 -0
- wbnews/migrations/__init__.py +0 -0
- wbnews/models/__init__.py +3 -0
- wbnews/models/news.py +116 -0
- wbnews/models/relationships.py +20 -0
- wbnews/models/sources.py +43 -0
- wbnews/serializers.py +83 -0
- wbnews/signals.py +4 -0
- wbnews/tests/__init__.py +0 -0
- wbnews/tests/conftest.py +6 -0
- wbnews/tests/test_models.py +15 -0
- wbnews/tests/tests.py +12 -0
- wbnews/urls.py +29 -0
- wbnews/viewsets/__init__.py +12 -0
- wbnews/viewsets/buttons.py +23 -0
- wbnews/viewsets/display.py +133 -0
- wbnews/viewsets/endpoints.py +18 -0
- wbnews/viewsets/menu.py +23 -0
- wbnews/viewsets/titles.py +39 -0
- wbnews/viewsets/views.py +140 -0
- wbnews-2.2.1.dist-info/METADATA +8 -0
- wbnews-2.2.1.dist-info/RECORD +43 -0
- wbnews-2.2.1.dist-info/WHEEL +5 -0
wbnews/.coveragerc
ADDED
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
[report]
|
|
2
|
+
exclude_lines =
|
|
3
|
+
print()
|
|
4
|
+
def api_endpoints_root
|
|
5
|
+
def get_or_create_model_sql
|
|
6
|
+
def profile_check
|
|
7
|
+
if hasattr
|
|
8
|
+
raise Exception
|
|
9
|
+
raise Http404
|
|
10
|
+
except:
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
[run]
|
|
14
|
+
omit = */migrations/*
|
|
15
|
+
manage.py
|
|
16
|
+
*/tests/*
|
|
17
|
+
*/wbnews_config/*
|
|
18
|
+
*/apps.py
|
|
19
|
+
*/docs/*
|
|
20
|
+
*/dynamic_preferences_registry.py
|
|
21
|
+
*/permissions.py
|
|
22
|
+
*/preferences/*
|
|
23
|
+
*/.venv/*
|
wbnews/__init__.py
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
__version__ = "1.0.0"
|
wbnews/admin.py
ADDED
|
@@ -0,0 +1,27 @@
|
|
|
1
|
+
from django.contrib import admin
|
|
2
|
+
|
|
3
|
+
from .models import News, NewsRelationship, NewsSource
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
@admin.register(NewsRelationship)
class NewsRelationshipAdmin(admin.ModelAdmin):
    """Admin configuration registered for the ``NewsRelationship`` model."""

    # ``news`` is selected through an autocomplete widget instead of a plain select.
    autocomplete_fields = ["news"]
    # Columns shown in the change list.
    list_display = ["news", "content_object"]
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
@admin.register(News)
class NewsAdmin(admin.ModelAdmin):
    """Admin configuration registered for the ``News`` model."""

    # Change-list columns and sidebar filter.
    list_display = ["title", "language", "tags", "source", "datetime"]
    list_filter = ("source",)

    # Free-text search over the title and the description.
    search_fields = ("title", "description")

    # Relation widgets: autocomplete for the source, raw id input for the import source.
    autocomplete_fields = ["source"]
    raw_id_fields = ["import_source"]
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
@admin.register(NewsSource)
class NewsSourceAdmin(admin.ModelAdmin):
    """Admin configuration registered for the ``NewsSource`` model."""

    # Sidebar filter on the source type.
    list_filter = ("type",)
    # Searchable columns; Django also requires ``search_fields`` on this admin
    # because ``NewsAdmin`` lists ``source`` in its ``autocomplete_fields``.
    search_fields = ("type", "title", "identifier", "description", "author", "url")
|
wbnews/apps.py
ADDED
wbnews/factories.py
ADDED
|
@@ -0,0 +1,33 @@
|
|
|
1
|
+
import factory
|
|
2
|
+
from django.conf.global_settings import LANGUAGES
|
|
3
|
+
from django.utils import timezone
|
|
4
|
+
from faker import Factory
|
|
5
|
+
from wbnews.models import News, NewsSource
|
|
6
|
+
|
|
7
|
+
langs = [n for (n, v) in LANGUAGES]
|
|
8
|
+
faker = Factory.create()
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
class NewsSourceFactory(factory.django.DjangoModelFactory):
    """Factory producing ``NewsSource`` instances.

    ``title`` and ``identifier`` are sequence based; every other field is
    generated lazily by Faker each time an instance is built.
    """

    title = factory.Sequence(lambda n: f"source_{n}")
    identifier = factory.Sequence(lambda n: f"http://myurl_{n}.com")
    # ``factory.Faker`` is evaluated per instance. A bare ``faker.url()`` call
    # would run once at class-definition time and be shared by every instance.
    image = factory.Faker("url")
    description = factory.Faker("sentence", nb_words=32)
    author = factory.Faker("name")
    url = factory.Faker("url")

    class Meta:
        model = NewsSource
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
class NewsFactory(factory.django.DjangoModelFactory):
    """Factory producing ``News`` instances attached to a freshly built source.

    ``datetime`` is the current time at build time, ``title`` is sequence
    based, ``language`` cycles through the module-level ``langs`` list and the
    remaining text fields are generated lazily by Faker.
    """

    datetime = factory.LazyFunction(timezone.now)
    title = factory.Sequence(lambda n: f"news_{n}")
    description = factory.Faker("sentence", nb_words=32)
    summary = factory.Faker("sentence", nb_words=32)
    language = factory.Iterator(langs)
    # Evaluated per instance. A bare ``faker.url()`` call would be computed once
    # at class-definition time and give every news item the same link.
    link = factory.Faker("url")
    source = factory.SubFactory(NewsSourceFactory)

    class Meta:
        model = News
|
|
File without changes
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
from .news import DataBackend
|
|
@@ -0,0 +1,35 @@
|
|
|
1
|
+
import json
|
|
2
|
+
from datetime import datetime
|
|
3
|
+
from io import BytesIO
|
|
4
|
+
from typing import Generator
|
|
5
|
+
|
|
6
|
+
import feedparser
|
|
7
|
+
from django.db.models import QuerySet
|
|
8
|
+
from slugify import slugify
|
|
9
|
+
from wbcore.contrib.io.backends.abstract import AbstractDataBackend
|
|
10
|
+
from wbcore.contrib.io.backends.utils import register
|
|
11
|
+
from wbnews.models import NewsSource
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
@register("News RSS Backend", save_data_in_import_source=True)
class DataBackend(AbstractDataBackend):
    """Import backend, registered as "News RSS Backend", that dumps RSS feeds to JSON files."""

    def is_object_valid(self, obj: "NewsSource") -> bool:
        """Return ``True`` when ``obj`` is an active RSS source with a non-empty URL."""
        # ``bool(...)`` honours the annotation: the bare ``and`` chain would
        # otherwise return the URL string (or ``None``/``""``) itself.
        return bool(obj.type == NewsSource.Type.RSS and obj.is_active and obj.url)

    def get_default_queryset(self) -> QuerySet["NewsSource"]:
        """Return the active RSS sources whose URL is not NULL."""
        return NewsSource.objects.filter(type=NewsSource.Type.RSS, is_active=True, url__isnull=False)

    def get_files(
        self, execution_time: datetime, queryset=None, **kwargs
    ) -> Generator[tuple[str, BytesIO], None, None] | None:
        """Yield one ``(file_name, content_file)`` pair per successfully parsed feed.

        Args:
            execution_time: Its timestamp is embedded in the generated file name.
            queryset: Iterable of news sources to fetch; nothing is yielded when ``None``.
            **kwargs: Accepted for interface compatibility and ignored here.

        Yields:
            The file name and a ``BytesIO`` holding the JSON-encoded feed, with
            the source primary key stored under the ``"news_source"`` key.
        """
        if queryset is None:
            return

        for source in queryset:
            data = feedparser.parse(source.url)
            # Feeds for which feedparser recorded an exception are skipped.
            if data.get("bozo_exception"):
                continue

            data["news_source"] = source.id
            content_file = BytesIO()
            # NOTE(review): the stream position is left at the end of the buffer
            # after write(); confirm the consumer rewinds or uses getvalue().
            content_file.write(json.dumps(data).encode())
            file_name = f"{slugify(source.title, separator='_')}_rss_file_{datetime.timestamp(execution_time)}.json"
            yield file_name, content_file
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
from .news import NewsImportHandler
|
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
from datetime import datetime
|
|
2
|
+
from typing import Any, Dict, Optional
|
|
3
|
+
|
|
4
|
+
import pytz
|
|
5
|
+
from django.db import models
|
|
6
|
+
from django.utils import timezone
|
|
7
|
+
from wbcore.contrib.io.imports import ImportExportHandler
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
class NewsImportHandler(ImportExportHandler):
    """Import handler bound to the ``wbnews.News`` model."""

    MODEL_APP_LABEL = "wbnews.News"

    def _deserialize(self, data: Dict[str, Any]):
        """Convert the raw ``source`` and ``datetime`` entries of ``data`` in place.

        ``source`` is passed through the model's ``source_dict_to_model``. A
        non-empty ``datetime`` string (``%Y-%m-%dT%H:%M:%S``) is parsed and
        localized as UTC; otherwise the current time is used.
        """
        data["source"] = self.model.source_dict_to_model(data["source"])

        raw_datetime = data.get("datetime", None)
        if not raw_datetime:
            data["datetime"] = timezone.now()
            return

        naive_datetime = datetime.strptime(raw_datetime, "%Y-%m-%dT%H:%M:%S")
        data["datetime"] = pytz.utc.localize(naive_datetime)

    def _get_instance(self, data: Dict[str, Any], history: Optional[models.QuerySet] = None, **kwargs) -> models.Model:
        """Return the first news matching source, datetime and title, or ``None`` if there is none."""
        matching = self.model.objects.filter(
            source=data["source"],
            datetime=data["datetime"],
            title=data["title"],
        )
        return matching.first()

    def _create_instance(self, data: Dict[str, Any], **kwargs) -> models.Model:
        """Append a line to the import source log and create the news from ``data``."""
        self.import_source.log += "\nCreate News."
        created = self.model.objects.create(**data, import_source=self.import_source)
        return created
|
|
File without changes
|
|
File without changes
|
|
@@ -0,0 +1,48 @@
|
|
|
1
|
+
import re
|
|
2
|
+
from contextlib import suppress
|
|
3
|
+
from datetime import datetime
|
|
4
|
+
|
|
5
|
+
from django.conf.global_settings import LANGUAGES
|
|
6
|
+
from langdetect import detect, lang_detect_exception
|
|
7
|
+
from wbcore.utils.importlib import import_from_dotted_path
|
|
8
|
+
|
|
9
|
+
from .utils import EmlContentParser
|
|
10
|
+
|
|
11
|
+
languages_dict = dict(LANGUAGES)
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def clean_string_with_paragraphs(string):
    """Trim ``string``, blank out ``\\n`` markers not preceded by a period, and collapse spaces.

    NOTE(review): the first pattern matches a literal backslash followed by
    ``n`` (two characters), not an actual newline character - confirm this is
    the intended input format.
    """
    trimmed = string.strip()
    without_markers = re.sub(r"(?<!\.)\\n", " ", trimmed)
    return re.sub(r" +", " ", without_markers)
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
def parse(import_source):
    """Turn the ``.eml`` file of ``import_source`` into a single news payload.

    Args:
        import_source: Object exposing ``file`` (readable bytes stream) and
            ``source`` (with ``import_parameters`` and ``uuid``).

    Returns:
        ``{"data": [news]}`` where ``news`` holds ``datetime``, ``title``,
        ``description``, ``source`` and, when detected and supported,
        ``language``.
    """
    # Local import: the module itself only imports ``datetime`` from ``datetime``.
    from datetime import timezone

    import_parameters = import_source.source.import_parameters
    parser = EmlContentParser(import_source.file.read(), encoding=import_parameters.get("email_encoding", "latin-1"))

    # The serialized string carries no offset and the news import handler
    # localizes it as UTC, so the value is normalized to UTC before formatting.
    email_date = parser.date
    if email_date is None:
        email_date = datetime.now(timezone.utc)
    elif email_date.tzinfo is not None:
        email_date = email_date.astimezone(timezone.utc)

    # Source
    html = parser.html

    # If the source defines a custom html parser, import it and convert the html with it.
    if html_parser_path := import_parameters.get("html_parser", None):
        with suppress(ModuleNotFoundError):
            html_parser = import_from_dotted_path(html_parser_path)
            html = html_parser(html)

    data = {
        "datetime": email_date.strftime("%Y-%m-%dT%H:%M:%S"),
        "title": parser.subject.replace(f"[{import_source.source.uuid}]", ""),
        "description": html,
        "source": parser.source,
    }

    # Language: best effort only, skipped when there is no html body to analyse
    # (``detect`` cannot handle ``None``).
    if html:
        try:
            language = detect(html)
        except lang_detect_exception.LangDetectException:
            language = None
        if language in languages_dict:
            data["language"] = language

    return {"data": [data]}
|
|
@@ -0,0 +1,61 @@
|
|
|
1
|
+
from email import message, parser
|
|
2
|
+
from email.utils import parseaddr, parsedate_to_datetime
|
|
3
|
+
|
|
4
|
+
|
|
5
|
+
class EmlContentParser:
|
|
6
|
+
def __init__(self, email: bytes, encoding: str = "latin-1"):
|
|
7
|
+
self.message = parser.BytesParser().parsebytes(email)
|
|
8
|
+
self.encoding = encoding
|
|
9
|
+
|
|
10
|
+
@property
|
|
11
|
+
def date(self):
|
|
12
|
+
if date_str := self.message.get("date"):
|
|
13
|
+
return parsedate_to_datetime(date_str)
|
|
14
|
+
|
|
15
|
+
@property
|
|
16
|
+
def subject(self) -> str:
|
|
17
|
+
return self.message.get("subject", "")
|
|
18
|
+
|
|
19
|
+
@property
|
|
20
|
+
def html(self):
|
|
21
|
+
html = self.get_html(self.message)
|
|
22
|
+
return html.decode(self.encoding) if html else None
|
|
23
|
+
|
|
24
|
+
def get_html(cls, parsed: message.Message) -> bytes | None:
|
|
25
|
+
if parsed.is_multipart():
|
|
26
|
+
for item in parsed.get_payload(): # type:message.Message
|
|
27
|
+
if html := cls.get_html(item):
|
|
28
|
+
return html
|
|
29
|
+
elif parsed.get_content_type() == "text/html":
|
|
30
|
+
return parsed.get_payload(decode=True)
|
|
31
|
+
return None
|
|
32
|
+
|
|
33
|
+
@property
|
|
34
|
+
def text(self):
|
|
35
|
+
text = self.get_text(self.message)
|
|
36
|
+
return text.decode(self.encoding) if text else None
|
|
37
|
+
|
|
38
|
+
@classmethod
|
|
39
|
+
def get_text(cls, parsed: message.Message) -> bytes | None:
|
|
40
|
+
if parsed.is_multipart():
|
|
41
|
+
for item in parsed.get_payload():
|
|
42
|
+
if text := cls.get_text(item):
|
|
43
|
+
return text
|
|
44
|
+
elif parsed.get_content_type() == "text/plain":
|
|
45
|
+
return parsed.get_payload(decode=True)
|
|
46
|
+
return None
|
|
47
|
+
|
|
48
|
+
@property
|
|
49
|
+
def source(self) -> dict[str, any]:
|
|
50
|
+
name, email = parseaddr(self.message["From"])
|
|
51
|
+
if not name:
|
|
52
|
+
name = "Generic"
|
|
53
|
+
if not email:
|
|
54
|
+
email = "generic"
|
|
55
|
+
source = {
|
|
56
|
+
"title": f"{name} Research Email",
|
|
57
|
+
"identifier": "research-email-" + email.lower(),
|
|
58
|
+
"author": name,
|
|
59
|
+
"url": email,
|
|
60
|
+
}
|
|
61
|
+
return source
|
|
File without changes
|
|
@@ -0,0 +1,63 @@
|
|
|
1
|
+
import json
|
|
2
|
+
from datetime import datetime
|
|
3
|
+
from time import mktime
|
|
4
|
+
|
|
5
|
+
from django.conf.global_settings import LANGUAGES
|
|
6
|
+
from langdetect import detect, lang_detect_exception
|
|
7
|
+
|
|
8
|
+
languages_dict = dict(LANGUAGES)
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
def _get_source(d):
|
|
12
|
+
source = {}
|
|
13
|
+
if source_id := d.get("news_source"):
|
|
14
|
+
source["id"] = source_id
|
|
15
|
+
else:
|
|
16
|
+
if "title" in d["feed"]:
|
|
17
|
+
source["title"] = d["feed"]["title"]
|
|
18
|
+
if "author" in d["feed"]:
|
|
19
|
+
source["author"] = d["feed"]["author"]
|
|
20
|
+
if "image" in d["feed"]:
|
|
21
|
+
source["image"] = d["feed"]["image"]["href"]
|
|
22
|
+
if "href" in d["feed"]:
|
|
23
|
+
source["identifier"] = d["feed"]["href"]
|
|
24
|
+
if "link" in d["feed"]:
|
|
25
|
+
source["url"] = d["feed"]["link"]
|
|
26
|
+
return source
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
def parse(import_source):
    """Convert the JSON dump of a parsed RSS feed into news payloads.

    Args:
        import_source: Object whose ``file`` attribute is a readable stream
            containing the JSON-encoded feed.

    Returns:
        ``{"data": [...]}`` with one dict per feed entry that has a summary.
        Entries without a summary are skipped.
    """
    content = json.load(import_source.file)
    source = _get_source(content)
    data = []

    for entry in content["entries"]:
        summary = entry.get("summary", None)
        if not summary:
            continue

        res = {
            "description": entry.get("description", summary),
            "summary": summary,
            "source": source,
            "title": entry.get("title", ""),
            "link": entry.get("link", None),
        }

        # Language detection is best effort; unsupported or undetectable
        # languages leave the key unset.
        try:
            language = detect(summary)
        except lang_detect_exception.LangDetectException:
            language = None
        if language in languages_dict:
            res["language"] = language

        if published_parsed := entry.get("published_parsed", None):
            # feedparser exposes ``published_parsed`` as a UTC time tuple and
            # the import handler localizes the string as UTC. Building the
            # datetime straight from the tuple avoids the local-time/DST shift
            # of a ``mktime``/``fromtimestamp`` round-trip.
            published = datetime(*published_parsed[:6])
            res["datetime"] = published.strftime("%Y-%m-%dT%H:%M:%S")

        if enclosures := entry.get("enclosures", None):
            res["enclosures"] = [e.get("href", "") for e in enclosures]

        # Only the first media item is considered for the image.
        media_content = entry.get("media_content", [])
        if media_content and isinstance(media_content, list):
            if image_url := media_content[0].get("url", None):
                res["image_url"] = image_url

        data.append(res)

    return {"data": data}
|