webtoolkit 0.0.2__tar.gz → 0.0.4__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: webtoolkit
3
- Version: 0.0.2
3
+ Version: 0.0.4
4
4
  Summary: Web Toolkit
5
5
  License: GPL3
6
6
  Author: Iwan Grozny
@@ -15,16 +15,19 @@ Classifier: Programming Language :: Python :: 3.12
15
15
  Classifier: Programming Language :: Python :: 3.13
16
16
  Requires-Dist: beautifulsoup4 (>=4.13.5,<5.0.0)
17
17
  Requires-Dist: brutefeedparser (>=0.10.5,<0.11.0)
18
- Requires-Dist: feedparser (>=6.0.12,<7.0.0)
19
18
  Requires-Dist: flask
20
- Requires-Dist: lxml (>=5.3.0,<6.0.0)
19
+ Requires-Dist: lxml (>=5.4.0,<6.0.0)
21
20
  Requires-Dist: psutil
22
21
  Requires-Dist: python-dateutil (>=2.8.2,<3.0.0)
23
22
  Requires-Dist: pytz (>=2024.2,<2025.0)
24
23
  Requires-Dist: requests (>=2.32.3,<3.0.0)
25
24
  Requires-Dist: tldextract (>=5.1.2,<6.0.0)
26
25
  Requires-Dist: url-cleaner
27
- Requires-Dist: yt-dlp (>=2025.9.5,<2026.0.0)
28
26
  Description-Content-Type: text/markdown
29
27
 
30
28
  # webtoolkit
29
+
30
+ - Tools necessary for web developers
31
+ - Status handling
32
+ - HTML and RSS page definitions
33
+
@@ -0,0 +1,5 @@
1
+ # webtoolkit
2
+
3
+ - Tools necessary for web developers
4
+ - Status handling
5
+ - HTML and RSS page definitions
@@ -3,7 +3,7 @@
3
3
 
4
4
  [tool.poetry]
5
5
  name = "webtoolkit"
6
- version = "0.0.2"
6
+ version = "0.0.4"
7
7
  description = "Web Toolkit"
8
8
  authors = ["Iwan Grozny <renegat@renegat0x0.ddns.net>"]
9
9
  license = "GPL3"
@@ -11,12 +11,10 @@ readme = "README.md"
11
11
 
12
12
  [tool.poetry.dependencies]
13
13
  python = "^3.9"
14
- feedparser = "^6.0.12"
15
14
  python-dateutil = "^2.8.2"
16
- yt-dlp = "^2025.9.5"
17
15
  tldextract = "^5.1.2"
18
16
  beautifulsoup4 = "^4.13.5"
19
- lxml = "^5.3.0"
17
+ lxml = "^5.4.0"
20
18
  brutefeedparser="^0.10.5"
21
19
  pytz = "^2024.2"
22
20
  psutil="*"
@@ -3,11 +3,23 @@ Similar project: https://pypi.org/project/abstract-webtools/
3
3
  """
4
4
 
5
5
  from .webtools import *
6
- from .pages import *
7
6
  from .webconfig import WebConfig
8
- from .urllocation import UrlLocation
9
7
 
8
+ from .contentinterface import ContentInterface
9
+ from .contentlinkparser import ContentLinkParser
10
+ from .urllocation import UrlLocation
10
11
  from .remoteserver import RemoteServer
12
+
13
+ from .pages import (
14
+ DefaultContentPage,
15
+ HtmlPage,
16
+ RssPage,
17
+ RssContentReader,
18
+ OpmlPage,
19
+ JsonPage,
20
+ PageFactory,
21
+ )
22
+
11
23
  from .contentmoderation import (
12
24
  UrlPropertyValidator,
13
25
  UrlPropertyValidator,
@@ -0,0 +1,398 @@
1
+ """
2
+ Provides interface and page types Html, RSS, JSON etc.
3
+ """
4
+
5
+ from time import strptime
6
+ import re
7
+ from datetime import datetime
8
+ from dateutil import parser
9
+
10
+ from utils.dateutils import DateUtils
11
+
12
+ from .webtools import (
13
+ calculate_hash,
14
+ WebLogger,
15
+ )
16
+ from .urllocation import UrlLocation
17
+
18
+
19
class ContentInterface(object):
    """
    Base interface for a page identified by a URL and its raw contents.

    Property getters (title, description, language, thumbnail, author,
    album, tags, date published) raise NotImplementedError here and are
    meant to be overridden. The rating, hashing, date guessing and tag
    detection helpers are implemented on top of those getters.
    """

    # Month names, full or abbreviated, with an optional trailing dot.
    # Kept as raw strings so that the escaped dot is not treated as an
    # (invalid) string escape sequence by the Python compiler.
    _MONTH_RE = (
        r"(jan(?:uary)?|feb(?:ruary)?|mar(?:ch)?|apr(?:il)?|may|jun(?:e)?"
        r"|jul(?:y)?|aug(?:ust)?|sep(?:tember)?|oct(?:ober)?|nov(?:ember)?"
        r"|dec(?:ember)?)\.?"
    )

    _FOUR_DIGIT_NUMBER = re.compile(r"\b\d{4}\b")

    # Date patterns in order of precedence; the first one that matches wins.
    # Each entry pairs a compiled pattern with the meaning of its groups.
    _DATE_PATTERNS = (
        # 2024-01-23
        (re.compile(r"(\d{4})-(\d{1,2})-(\d{1,2})"), ("year", "month", "day")),
        # 2024 jan 23
        (
            re.compile(r"(\d{4})\s*" + _MONTH_RE + r"\s*(\d{1,2})"),
            ("year", "month", "day"),
        ),
        # jan 23 2024
        (
            re.compile(_MONTH_RE + r"\s*(\d{1,2})\s*(\d{4})"),
            ("month", "day", "year"),
        ),
        # 23 jan 2024
        (
            re.compile(r"(\d{1,2})\s*" + _MONTH_RE + r"\s*(\d{4})"),
            ("day", "month", "year"),
        ),
        # only jan 23, without a year next to it
        (re.compile(r"\b" + _MONTH_RE + r"\s*(\d+)\b"), ("month", "day")),
    )

    def __init__(self, url, contents):
        """
        Stores the page URL and its raw contents.
        """
        self.url = url
        self.contents = contents

    def get_contents(self):
        """
        Returns the contents passed to the constructor.
        """
        return self.contents

    def get_title(self):
        """
        Must be provided by subclasses.
        """
        raise NotImplementedError

    def get_description(self):
        """
        Must be provided by subclasses.
        """
        raise NotImplementedError

    def get_language(self):
        """
        Must be provided by subclasses.
        """
        raise NotImplementedError

    def get_thumbnail(self):
        """
        Must be provided by subclasses.
        """
        raise NotImplementedError

    def get_author(self):
        """
        Must be provided by subclasses.
        """
        raise NotImplementedError

    def get_album(self):
        """
        Must be provided by subclasses.
        """
        raise NotImplementedError

    def get_tags(self):
        """
        Must be provided by subclasses.
        """
        raise NotImplementedError

    def get_url(self):
        """
        Returns the URL passed to the constructor.
        """
        return self.url

    def get_canonical_url(self):
        """
        Default behavior: the canonical URL is the page URL itself.
        """
        return self.url

    def get_feeds(self):
        """
        Default behavior: no feeds.
        """
        return []

    def get_page_rating(self):
        """
        Default behavior.

        Sums the property rating vector and the link rating vector and
        returns the achieved share of the maximum as an integer percentage.
        Returns 0 when either the achieved or the maximum sum is zero.
        """
        # Build a new list instead of extending the returned one, so that
        # an overriding get_page_rating_vector may return a shared list.
        ratings = list(self.get_page_rating_vector())
        ratings.extend(self.get_link_rating())

        page_rating = sum(rating[0] for rating in ratings)
        max_page_rating = sum(rating[1] for rating in ratings)

        if page_rating == 0 or max_page_rating == 0:
            return 0

        percentage = (float(page_rating) * 100.0) / float(max_page_rating)

        try:
            return int(percentage)
        except ValueError:
            return 0

    @staticmethod
    def _has_value(value):
        """
        Tells whether a property value is neither None nor an empty string.
        """
        return value is not None and str(value) != ""

    def get_page_rating_vector(self):
        """
        Returns vector of [actual rating, max rating] pairs, one for each
        property that has a value.
        """
        # NOTE(review): a property without a value contributes nothing, not
        # even to the maximum, so missing properties do not lower the
        # percentage computed by get_page_rating. Confirm this is intended.
        result = []

        if self._has_value(self.get_title()):
            result.append([10, 10])

        if self._has_value(self.get_description()):
            result.append([5, 5])

        if self._has_value(self.get_language()):
            result.append([1, 1])

        if self._has_value(self.get_thumbnail()):
            result.append([1, 1])

        if self._has_value(self.get_date_published()):
            result.append([1, 1])

        return result

    def get_date_published(self):
        """
        This should be date. Not string
        """
        raise NotImplementedError

    def get_contents_hash(self):
        """
        Returns calculate_hash of the contents, or None if there are no
        contents.
        """
        contents = self.get_contents()
        if contents:
            return calculate_hash(contents)
        return None

    def get_contents_body_hash(self):
        """
        Default behavior: same as the hash of the whole contents.
        """
        return self.get_contents_hash()

    def get_properties(self):
        """
        Returns a dictionary with the values of all property getters.
        """
        return {
            "link": self.url,
            "title": self.get_title(),
            "description": self.get_description(),
            "author": self.get_author(),
            "album": self.get_album(),
            "thumbnail": self.get_thumbnail(),
            "language": self.get_language(),
            "page_rating": self.get_page_rating(),
            "date_published": self.get_date_published(),
            "tags": self.get_tags(),
            "link_canonical": self.get_canonical_url(),
        }

    def is_cloudflare_protected(self):
        """
        Should not obtain contents by itself

        You'd probably be more successful trying to not trigger
        the bot detection in the first place rather than trying to bypass it after the fact.
        """
        contents = self.contents
        if not contents:
            return False

        return "https://challenges.cloudflare.com" in contents

    def guess_date(self):
        """
        This is ugly, but dateutil.parser does not work. May generate exceptions.
        Ugly is better than not working.

        Looks for the current year in the contents, and failing that for the
        first four digit number, then tries to read a date from the text
        around it. Returns None if nothing could be guessed.

        Supported formats:
         - Jan. 15, 2024
         - Jan 15, 2024
         - January 15, 2024
         - 15 January 2024 14:48 UTC
        """
        content = self.get_contents()
        if not content:
            return None

        # searching will be case insensitive
        content = content.lower()

        current_year = datetime.now().year

        # Prefer the current year; otherwise take the first four digit number
        match = re.search(rf"\b{current_year}\b", content)
        if match is None:
            match = self._FOUR_DIGIT_NUMBER.search(content)
        if match is None:
            return None

        try:
            year = int(match.group(0))
        except ValueError:
            return None

        # Limit the scope to a specific portion before and after year
        start = match.start()
        scope = content[max(0, start - 15) : start + 20]

        return self.guess_by_scope(scope, year)

    def guess_by_scope(self, scope, year):
        """
        Tries to read a date from a short, lower case piece of text.

        The year argument is used only when the text contains a month and
        a day without a year. Returns the date converted by
        DateUtils.to_utc_date, or None when no date was recognized.
        """
        date_object = None

        for pattern, fields in self._DATE_PATTERNS:
            match = pattern.search(scope)
            if not match:
                continue

            parts = dict(zip(fields, match.groups()))
            date_object = self.format_date(
                parts.get("year", year), parts["month"], parts["day"]
            )
            # Only the first matching pattern is considered
            break

        # For other scenario to not provide any value

        if date_object:
            date_object = DateUtils.to_utc_date(date_object)

        return date_object

    def format_date(self, year, month, day):
        """
        Builds a datetime from year, month and day.

        The month may be a number, an abbreviated month name, or a full
        month name, passed as string. The day is expected to be a string.
        Returns None if the date cannot be built.
        """
        month_number = None

        try:
            # Conversion is used only to check that the month is numeric
            int(month)
            month_number = month
        except ValueError as E:
            WebLogger.debug("Error:{}".format(str(E)))

        # Abbreviated month name first, then the full month name
        for month_format in ("%b", "%B"):
            if month_number:
                break
            try:
                month_number = str(strptime(month, month_format).tm_mon)
            except Exception as E:
                WebLogger.debug("Error:{}".format(str(E)))

        if month_number is None:
            WebLogger.debug(
                "Guessing date error: URL:{};\nYear:{};\nMonth:{}\nDay:{}".format(
                    self.url, year, month, day
                )
            )
            return None

        try:
            return datetime.strptime(
                f"{year}-{month_number.zfill(2)}-{day.zfill(2)}", "%Y-%m-%d"
            )
        except Exception:
            WebLogger.debug(
                "Guessing date error: URL:{};\nYear:{};\nMonth:{}\nDay:{}".format(
                    self.url, year, month, day
                )
            )
            return None

    def get_position_of_html_tags(self):
        """
        Returns position of the html tag if the contents look like HTML,
        which means an html tag together with a body or a meta tag.
        Returns -1 otherwise.
        """
        if not self.contents:
            return -1

        lower = self.contents.lower()
        html_position = lower.find("<html")

        if html_position >= 0 and ("<body" in lower or "<meta" in lower):
            return html_position

        return -1

    def get_position_of_rss_tags(self):
        """
        Returns position of the root feed tag if the contents look like
        a feed, -1 otherwise. A root tag counts only together with its
        companion tag.
        """
        if not self.contents:
            return -1

        lower = self.contents.lower()

        tag_pairs = (
            ("<rss", "<channel"),
            ("<feed", "<entry"),
            ("<rdf", "<channel"),
        )
        for root_tag, companion_tag in tag_pairs:
            root_position = lower.find(root_tag)
            if root_position >= 0 and companion_tag in lower:
                return root_position

        return -1

    def get_link_rating(self):
        """
        Returns vector of [actual rating, max rating] pairs computed from
        the URL alone: its scheme, whether it is a domain, the number of
        dots in the domain, and the URL length.
        """
        rating = []

        # https, ftp and smb score a point; plain http and anything else does not
        if self.url.startswith(("https://", "ftp://", "smb://")):
            rating.append([1, 1])
        else:
            rating.append([0, 1])

        location = UrlLocation(self.url)
        if location.is_domain():
            rating.append([1, 1])

        dots = location.get_domain_only().count(".")
        if dots == 1:
            rating.append([2, 2])
        elif dots == 2:
            rating.append([1, 2])
        else:
            rating.append([0, 2])

        # as example https://www.youtube.com has 23 chars

        url_length = len(self.url)
        if url_length < 25:
            rating.append([2, 2])
        elif url_length < 30:
            rating.append([1, 2])
        else:
            rating.append([0, 2])

        return rating