webchanges 3.29.0__tar.gz → 3.30.0__tar.gz
This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the package versions as they appear in their respective public registries.
- {webchanges-3.29.0/webchanges.egg-info → webchanges-3.30.0}/PKG-INFO +4 -3
- {webchanges-3.29.0 → webchanges-3.30.0}/README.rst +3 -2
- {webchanges-3.29.0 → webchanges-3.30.0}/webchanges/__init__.py +1 -1
- {webchanges-3.29.0 → webchanges-3.30.0}/webchanges/cli.py +11 -6
- {webchanges-3.29.0 → webchanges-3.30.0}/webchanges/config.py +2 -1
- {webchanges-3.29.0 → webchanges-3.30.0}/webchanges/differs.py +93 -13
- {webchanges-3.29.0 → webchanges-3.30.0}/webchanges/filters.py +74 -29
- {webchanges-3.29.0 → webchanges-3.30.0}/webchanges/jobs.py +12 -12
- {webchanges-3.29.0 → webchanges-3.30.0/webchanges.egg-info}/PKG-INFO +4 -3
- {webchanges-3.29.0 → webchanges-3.30.0}/LICENSE +0 -0
- {webchanges-3.29.0 → webchanges-3.30.0}/MANIFEST.in +0 -0
- {webchanges-3.29.0 → webchanges-3.30.0}/pyproject.toml +0 -0
- {webchanges-3.29.0 → webchanges-3.30.0}/requirements.txt +0 -0
- {webchanges-3.29.0 → webchanges-3.30.0}/setup.cfg +0 -0
- {webchanges-3.29.0 → webchanges-3.30.0}/webchanges/__main__.py +0 -0
- {webchanges-3.29.0 → webchanges-3.30.0}/webchanges/_vendored/__init__.py +0 -0
- {webchanges-3.29.0 → webchanges-3.30.0}/webchanges/_vendored/headers.py +0 -0
- {webchanges-3.29.0 → webchanges-3.30.0}/webchanges/_vendored/packaging_version.py +0 -0
- {webchanges-3.29.0 → webchanges-3.30.0}/webchanges/command.py +0 -0
- {webchanges-3.29.0 → webchanges-3.30.0}/webchanges/handler.py +0 -0
- {webchanges-3.29.0 → webchanges-3.30.0}/webchanges/mailer.py +0 -0
- {webchanges-3.29.0 → webchanges-3.30.0}/webchanges/main.py +0 -0
- {webchanges-3.29.0 → webchanges-3.30.0}/webchanges/py.typed +0 -0
- {webchanges-3.29.0 → webchanges-3.30.0}/webchanges/reporters.py +0 -0
- {webchanges-3.29.0 → webchanges-3.30.0}/webchanges/storage.py +0 -0
- {webchanges-3.29.0 → webchanges-3.30.0}/webchanges/storage_minidb.py +0 -0
- {webchanges-3.29.0 → webchanges-3.30.0}/webchanges/util.py +0 -0
- {webchanges-3.29.0 → webchanges-3.30.0}/webchanges/worker.py +0 -0
- {webchanges-3.29.0 → webchanges-3.30.0}/webchanges.egg-info/SOURCES.txt +0 -0
- {webchanges-3.29.0 → webchanges-3.30.0}/webchanges.egg-info/dependency_links.txt +0 -0
- {webchanges-3.29.0 → webchanges-3.30.0}/webchanges.egg-info/entry_points.txt +0 -0
- {webchanges-3.29.0 → webchanges-3.30.0}/webchanges.egg-info/requires.txt +0 -0
- {webchanges-3.29.0 → webchanges-3.30.0}/webchanges.egg-info/top_level.txt +0 -0
{webchanges-3.29.0/webchanges.egg-info → webchanges-3.30.0}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: webchanges
-Version: 3.29.0
+Version: 3.30.0
 Summary: Web Changes Delivered. AI-Summarized. Totally Anonymous.
 Author-email: Mike Borsetti <mike+webchanges@borsetti.com>
 Maintainer-email: Mike Borsetti <mike+webchanges@borsetti.com>

@@ -206,8 +206,9 @@ Install **webchanges** with:

 Running in Docker
 -----------------
-**webchanges** can easily run in a
-`here <https://github.com/yubiuser/webchanges-docker>`__
+**webchanges** can easily run in a `Docker <https://www.docker.com/>`__ container! You will find a minimal
+implementation (no browser) `here <https://github.com/yubiuser/webchanges-docker>`__, and one with a browser
+`here <https://github.com/jhedlund/webchanges-docker>`__.


 Documentation |readthedocs|
{webchanges-3.29.0 → webchanges-3.30.0}/README.rst

@@ -39,8 +39,9 @@ Install **webchanges** with:

 Running in Docker
 -----------------
-**webchanges** can easily run in a
-`here <https://github.com/yubiuser/webchanges-docker>`__
+**webchanges** can easily run in a `Docker <https://www.docker.com/>`__ container! You will find a minimal
+implementation (no browser) `here <https://github.com/yubiuser/webchanges-docker>`__, and one with a browser
+`here <https://github.com/jhedlund/webchanges-docker>`__.


 Documentation |readthedocs|
{webchanges-3.29.0 → webchanges-3.30.0}/webchanges/__init__.py

@@ -22,7 +22,7 @@ __project_name__ = __package__
 # * MINOR version when you add functionality in a backwards compatible manner, and
 # * MICRO or PATCH version when you make backwards compatible bug fixes. We no longer use '0'
 # If unsure on increments, use pkg_resources.parse_version to parse
-__version__ = '3.29.0'
+__version__ = '3.30.0'
 __description__ = (
     'Check web (or command output) for changes since last run and notify.\n'
     '\n'
{webchanges-3.29.0 → webchanges-3.30.0}/webchanges/cli.py

@@ -241,11 +241,16 @@ def first_run(command_config: CommandConfig) -> None:
     print(f'> Edit it with {__project_name__} --edit')


-def load_hooks(hooks_file: Path) -> None:
+def load_hooks(hooks_file: Path, is_default: bool = False) -> None:
     """Load hooks file."""
     if not hooks_file.is_file():
-
-
+        if is_default:
+            logger.info(f'Hooks file {hooks_file} does not exist or is not a file')
+        else:
+            # do not use ImportWarning as it could be suppressed
+            warnings.warn(
+                f'Hooks file {hooks_file} not imported because it does not exist or is not a file', RuntimeWarning
+            )
         return

     hooks_file_errors = file_ownership_checks(hooks_file)

@@ -258,9 +263,9 @@ def load_hooks(hooks_file: Path) -> None:
             RuntimeWarning,
         )
     else:
-        logger.info(f'Importing hooks module from {hooks_file}')
+        logger.info(f'Importing into hooks module from {hooks_file}')
         import_module_from_source('hooks', hooks_file)
-        logger.info('Finished importing hooks module')
+        logger.info('Finished importing into hooks module')


 def handle_unitialized_actions(urlwatch_config: CommandConfig) -> None:

@@ -406,7 +411,7 @@ def main() -> None:  # pragma: no cover
     if command_config.hooks_files:
         logger.debug(f'Hooks files to be loaded: {command_config.hooks_files}')
         for hooks_file in command_config.hooks_files:
-            load_hooks(hooks_file)
+            load_hooks(hooks_file, is_default=not command_config.hooks_files_inputted)
     config_storage.load()

     # Setup database API
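A note on the three cli.py hunks above: `load_hooks()` now distinguishes the default hooks file (whose absence is expected and merely logged) from a hooks file the user explicitly passed on the command line (whose absence raises a visible `RuntimeWarning`), with `hooks_files_inputted` (added in config.py below) recording which case applies. A minimal standalone sketch of that policy, mirroring the diff but outside webchanges:

```python
# Sketch of the new missing-hooks-file policy (mirrors the hunks above;
# not webchanges' actual module).
import logging
import warnings
from pathlib import Path

logger = logging.getLogger(__name__)


def load_hooks_sketch(hooks_file: Path, is_default: bool = False) -> None:
    if not hooks_file.is_file():
        if is_default:
            # the default hooks file is optional: only log its absence
            logger.info(f'Hooks file {hooks_file} does not exist or is not a file')
        else:
            # a user-specified file that is missing deserves a visible warning;
            # RuntimeWarning is used because ImportWarning is often suppressed
            warnings.warn(
                f'Hooks file {hooks_file} not imported because it does not exist or is not a file',
                RuntimeWarning,
            )
        return
    # ... ownership checks and module import would follow here ...
```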
{webchanges-3.29.0 → webchanges-3.30.0}/webchanges/config.py

@@ -50,6 +50,7 @@ class CommandConfig(BaseConfig):
     footnote: str | None
     gc_database: int | None
     hooks_files: list[Path]
+    hooks_files_inputted: bool
     install_chrome: bool
     joblist: Collection[str | int]
     jobs_files: list[Path]

@@ -89,6 +90,7 @@ class CommandConfig(BaseConfig):
         super().__init__(config_path, config_file, jobs_def_file, hooks_def_file, ssdb_file)
         self.parse_args(args)
         self.jobs_files = self.jobs_files or [jobs_def_file]
+        self.hooks_files_inputted = bool(self.hooks_files)
         self.hooks_files = self.hooks_files or [hooks_def_file]

 class CustomHelpFormatter(argparse.RawDescriptionHelpFormatter):

@@ -145,7 +147,6 @@ class CommandConfig(BaseConfig):
             '--jobs',
             '--urls',
             action='append',
-            # default=[self.jobs_def_file],
             type=Path,
             help='read job list (URLs/commands) from FILE or files matching a glob pattern',
             metavar='FILE',
{webchanges-3.29.0 → webchanges-3.30.0}/webchanges/differs.py

@@ -27,6 +27,7 @@ from typing import Any, Iterator, Literal, TYPE_CHECKING, TypedDict
 from zoneinfo import ZoneInfo

 import html2text
+import yaml

 from webchanges.jobs import JobBase
 from webchanges.util import linkify, mark_to_html, TrackSubClasses

@@ -64,9 +65,11 @@ except ImportError:  # pragma: no cover
     import json as jsonlib  # type: ignore[no-redef]

 try:
+    from xml.parsers.expat import ExpatError
+
     import xmltodict
 except ImportError as e:  # pragma: no cover
-    xmltodict = str(e)  # type: ignore[no-redef]
+    xmltodict = str(e)  # type: ignore[no-redef,assignment]

 # https://stackoverflow.com/questions/39740632
 if TYPE_CHECKING:
@@ -743,7 +746,7 @@ class DeepdiffDiffer(DifferBase):
     __kind__ = 'deepdiff'

     __supported_directives__ = {
-        'data_type': "either 'json' (default) or 'xml'",
+        'data_type': "either 'json' (default), 'yaml', or 'xml'",
         'ignore_order': 'Whether to ignore the order in which the items have appeared (default: false)',
         'ignore_string_case': 'Whether to be case-sensitive or not when comparing strings (default: false)',
         'significant_digits': (
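The `data_type` directive now accepts `'yaml'`. Conceptually, the differ parses both revisions into Python objects and hands them to the deepdiff library; a standalone sketch of what that enables (example data made up, not webchanges code):

```python
# Compare two YAML revisions the way the deepdiff differ does conceptually:
# parse with PyYAML, then diff the resulting objects with deepdiff.
import yaml
from deepdiff import DeepDiff

old = yaml.safe_load('server:\n  port: 8080\n  debug: false\n')
new = yaml.safe_load('server:\n  port: 8443\n  debug: false\n')

diff = DeepDiff(old, new, ignore_order=False)
print(diff)
# {'values_changed': {"root['server']['port']": {'new_value': 8443, 'old_value': 8080}}}
```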
@@ -859,14 +862,70 @@

         return '\n'.join(result)

-
-
-
-
+        if directives.get('data_type'):
+            old_data_type = directives['data_type']
+            new_data_type = directives['data_type']
+        else:
+            if self.state.old_mime_type:
+                media_subtype = self.state.old_mime_type.split('/')[-1].split('+')[-1].split('x-')[-1]
+                if media_subtype in ('yaml', 'yml'):
+                    old_data_type = 'yaml'
+                elif media_subtype == 'xml':
+                    old_data_type = 'xml'
+                elif media_subtype == 'json':
+                    old_data_type = 'json'
+                else:
+                    logger.info(
+                        f'Differ {self.__kind__} could not determine data type of old data from media type '
+                        f"{self.state.old_mime_type}; defaulting to 'json'"
+                    )
+                    old_data_type = 'json'
+            else:
+                logger.info(
+                    f"Differ {self.__kind__} data_type for old data defaulted to 'json' as media type is missing"
+                )
+                old_data_type = 'json'
+            if self.state.new_mime_type:
+                media_subtype = self.state.new_mime_type.split('/')[-1].split('+')[-1].split('x-')[-1]
+                if media_subtype in ('yaml', 'yml'):
+                    new_data_type = 'yaml'
+                elif media_subtype == 'xml':
+                    new_data_type = 'xml'
+                elif media_subtype == 'json':
+                    new_data_type = 'json'
+                else:
+                    logger.info(
+                        f'Differ {self.__kind__} could not determine data type of new data from media type '
+                        f"{self.state.new_mime_type}; defaulting to 'json'"
+                    )
+                    new_data_type = 'json'
+            else:
+                logger.info(
+                    f"Differ {self.__kind__} data_type for new data defaulted to 'json' as media type is missing"
+                )
+                new_data_type = 'json'
+
+        old_data: Any = ''
+        if old_data_type == 'json':
             try:
                 old_data = jsonlib.loads(self.state.old_data)
             except jsonlib.JSONDecodeError:
-
+                pass
+        elif old_data_type == 'yaml':
+            try:
+                old_data = yaml.safe_load(self.state.old_data)
+            except yaml.YAMLError:
+                pass
+        elif old_data_type == 'xml':
+            if isinstance(xmltodict, str):  # pragma: no cover
+                self.raise_import_error('xmltodict', xmltodict)
+            try:
+                old_data = xmltodict.parse(self.state.old_data)
+            except ExpatError:
+                pass
+
+        new_data: Any = ''
+        if new_data_type == 'json':
             try:
                 new_data = jsonlib.loads(self.state.new_data)
             except jsonlib.JSONDecodeError as e:
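The media-type sniffing added above reduces a `Content-Type` value to its bare subtype with three chained `split()` calls, so that e.g. `application/x-yaml` and `application/ld+json` are recognized. A quick standalone check of that expression:

```python
# Standalone check of the subtype-extraction expression used above.
for mime_type in ('application/json', 'application/ld+json', 'application/x-yaml', 'text/yaml', 'application/xml'):
    media_subtype = mime_type.split('/')[-1].split('+')[-1].split('x-')[-1]
    print(f'{mime_type:20} -> {media_subtype}')
# application/json     -> json
# application/ld+json  -> json
# application/x-yaml   -> yaml
# text/yaml            -> yaml
# application/xml      -> xml
```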
@@ -879,12 +938,34 @@
                 'markdown': f'Differ {self.__kind__} **ERROR: New data is invalid JSON**\n{e}',
                 'html': f'Differ {self.__kind__} <b>ERROR: New data is invalid JSON</b>\n{e}',
             }
-        elif
+        elif new_data_type == 'yaml':
+            try:
+                new_data = yaml.safe_load(self.state.new_data)
+            except yaml.YAMLError as e:
+                self.state.exception = e
+                self.state.traceback = self.job.format_error(e, traceback.format_exc())
+                logger.error(f'Job {self.job.index_number}: New data is invalid YAML: {e} ({self.job.get_location()})')
+                logger.info(f'Job {self.job.index_number}: {self.state.new_data!r}')
+                return {
+                    'text': f'Differ {self.__kind__} ERROR: New data is invalid YAML\n{e}',
+                    'markdown': f'Differ {self.__kind__} **ERROR: New data is invalid YAML**\n{e}',
+                    'html': f'Differ {self.__kind__} <b>ERROR: New data is invalid YAML</b>\n{e}',
+                }
+        elif new_data_type == 'xml':
             if isinstance(xmltodict, str):  # pragma: no cover
                 self.raise_import_error('xmltodict', xmltodict)
-
-
-
+            try:
+                new_data = xmltodict.parse(self.state.new_data)
+            except ExpatError as e:
+                self.state.exception = e
+                self.state.traceback = self.job.format_error(e, traceback.format_exc())
+                logger.error(f'Job {self.job.index_number}: New data is invalid XML: {e} ({self.job.get_location()})')
+                logger.info(f'Job {self.job.index_number}: {self.state.new_data!r}')
+                return {
+                    'text': f'Differ {self.__kind__} ERROR: New data is invalid XML\n{e}',
+                    'markdown': f'Differ {self.__kind__} **ERROR: New data is invalid XML**\n{e}',
+                    'html': f'Differ {self.__kind__} <b>ERROR: New data is invalid XML</b>\n{e}',
+                }

         ignore_order: bool = directives.get('ignore_order')  # type: ignore[assignment]
         ignore_string_case: bool = directives.get('ignore_string_case')  # type: ignore[assignment]
@@ -1566,7 +1647,7 @@ class AIGoogleDiffer(DifferBase):
             directives_text = ''
         footer = (
             f"Summary by Google Generative AI's model {model_version}{directives_text}"
-            if model_version
+            if model_version or directives_text
             else ''
         )
         temp_unfiltered_diff: dict[Literal['text', 'markdown', 'html'], str] = {}

@@ -1586,7 +1667,6 @@
                 [
                     mark_to_html(summary, extras={'tables'}).replace('<h2>', '<h3>').replace('</h2>', '</h3>'),
                     '<br>',
-                    '<br>',
                     unified_report['html'],
                 ]
                 + (['-----<br>', f'<i><small>{footer}</small></i>'] if footer else [])
{webchanges-3.29.0 → webchanges-3.30.0}/webchanges/filters.py

@@ -34,9 +34,9 @@ try:
     from lxml import etree  # noqa: S410 insecure use of XML modules, prefer "defusedxml". TODO
     from lxml.cssselect import CSSSelector  # noqa: S410 insecure use of XML ... "defusedxml". TODO
 except ImportError as e:
-    from xml import etree
+    from xml import etree  # type: ignore[no-redef]

-    CSSSelector = str(e)
+    CSSSelector = str(e)  # type: ignore[misc,assignment]

 # https://stackoverflow.com/questions/712791
 try:

@@ -90,7 +90,7 @@ except ImportError as e:  # pragma: has-pytesseract
     pytesseract = str(e)  # type: ignore[assignment]

 try:
-    import vobject
+    import vobject.base
 except ImportError as e:  # pragma: no cover
     vobject = str(e)  # type: ignore[assignment]
@@ -262,7 +262,7 @@ class FilterBase(metaclass=TrackSubClasses):
         :param subfilter: The subfilter information.
         :param job_state: The JobState object (containing the Job).
         :param data: The data upon which to apply the filter.
-        :returns: The data and MIME type of the data after the filter has been applied.
+        :returns: The data and media type (fka MIME type) of the data after the filter has been applied.
         """
         logger.info(f'Job {job_state.job.index_number}: Applying filter {filter_kind}, subfilter(s) {subfilter}')
         filtercls: type[FilterBase] | None = cls.__subclasses__.get(filter_kind)  # type: ignore[assignment]

@@ -308,7 +308,7 @@ class FilterBase(metaclass=TrackSubClasses):

         :param data: The data to be filtered (processed).
         :param subfilter: The subfilter information.
-        :returns: The data and MIME type of the data after the filter has been applied.
+        :returns: The data and media type (fka MIME type) of the data after the filter has been applied.
         """
         raise NotImplementedError()

@@ -354,7 +354,7 @@ class AutoMatchFilter(FilterBase):

         :param data: The data to be filtered (processed).
         :param subfilter: The subfilter information.
-        :returns: The data and MIME type of the data after the filter has been applied.
+        :returns: The data and media type (fka MIME type) of the data after the filter has been applied.
         """
         pass

@@ -391,7 +391,7 @@ class RegexMatchFilter(FilterBase):

         :param data: The data to be filtered (processed).
         :param subfilter: The subfilter information.
-        :returns: The data and MIME type of the data after the filter has been applied.
+        :returns: The data and media type (fka MIME type) of the data after the filter has been applied.
         """
         pass

@@ -414,7 +414,7 @@ class BeautifyFilter(FilterBase):

         :param data: The data to be filtered (processed).
         :param subfilter: The subfilter information.
-        :returns: The data and MIME type of the data after the filter has been applied.
+        :returns: The data and media type (fka MIME type) of the data after the filter has been applied.
         """
         if isinstance(bs4, str):
             self.raise_import_error('BeautifulSoup', self.__kind__, bs4)
@@ -462,14 +462,14 @@ class AbsoluteLinksFilter(FilterBase):
     def filter(self, data: str | bytes, mime_type: str, subfilter: dict[str, Any]) -> tuple[str | bytes, str]:
         tree = etree.HTML(data)
         elem: etree._Element
-        for elem in tree.xpath('//*[@action]'):
-            elem.attrib['action'] = urljoin(self.job.url, elem.attrib['action'])
-        for elem in tree.xpath('//object[@data]'):
-            elem.attrib['data'] = urljoin(self.job.url, elem.attrib['data'])
-        for elem in tree.xpath('//*[@href]'):
-            elem.attrib['href'] = urljoin(self.job.url, elem.attrib['href'])
-        for elem in tree.xpath('//*[@src]'):
-            elem.attrib['src'] = urljoin(self.job.url, elem.attrib['src'])
+        for elem in tree.xpath('//*[@action]'):  # type: ignore[assignment,union-attr]
+            elem.attrib['action'] = urljoin(self.job.url, elem.attrib['action'])  # type: ignore[type-var,assignment]
+        for elem in tree.xpath('//object[@data]'):  # type: ignore[assignment,union-attr]
+            elem.attrib['data'] = urljoin(self.job.url, elem.attrib['data'])  # type: ignore[type-var,assignment]
+        for elem in tree.xpath('//*[@href]'):  # type: ignore[assignment,union-attr]
+            elem.attrib['href'] = urljoin(self.job.url, elem.attrib['href'])  # type: ignore[type-var,assignment]
+        for elem in tree.xpath('//*[@src]'):  # type: ignore[assignment,union-attr]
+            elem.attrib['src'] = urljoin(self.job.url, elem.attrib['src'])  # type: ignore[type-var,assignment]
         return etree.tostring(tree, encoding='unicode', method='html'), mime_type
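For context on the `AbsoluteLinksFilter` hunk above (which only adds `type: ignore` comments for mypy): each `action`, `data`, `href`, and `src` attribute is resolved against the job's URL with `urllib.parse.urljoin`. A small illustration with a made-up base URL:

```python
# How urljoin resolves the attribute values rewritten above (base URL is made up).
from urllib.parse import urljoin

base = 'https://example.com/news/index.html'
print(urljoin(base, 'article1.html'))        # https://example.com/news/article1.html
print(urljoin(base, '/images/logo.png'))     # https://example.com/images/logo.png
print(urljoin(base, 'https://other.org/x'))  # https://other.org/x (absolute URLs pass through)
```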
@@ -523,7 +523,7 @@ class Html2TextFilter(FilterBase):

         :param data: The data to be filtered (processed).
         :param subfilter: The subfilter information.
-        :returns: The data and MIME type of the data after the filter has been applied.
+        :returns: The data and media type (fka MIME type) of the data after the filter has been applied.
         """

         # extract method and options from subfilter, defaulting to method html2text
@@ -748,12 +748,12 @@ class Ical2TextFilter(FilterBase):

         result = []
         if isinstance(data, str):
-            parsedCal = vobject.readOne(data)
+            parsedCal = vobject.base.readOne(data)
         else:
             try:
-                parsedCal = vobject.readOne(data)
-            except vobject.ParseError:
-                parsedCal = vobject.readOne(data.decode(errors='ignore'))
+                parsedCal = vobject.base.readOne(data)
+            except vobject.base.ParseError:
+                parsedCal = vobject.base.readOne(data.decode(errors='ignore'))
                 logger.warning('Found and ignored Unicode-related errors when reading iCal entry.')

         for event in parsedCal.getChildren():
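The `Ical2TextFilter` hunk above moves from the `vobject` package's top-level names to the explicit `vobject.base` module, where `readOne()` and `ParseError` live. A hedged standalone sketch with a made-up iCalendar snippet:

```python
# Hypothetical check of the vobject.base entry points used above; the
# iCalendar data is made up.
import vobject.base

ics = (
    'BEGIN:VCALENDAR\r\n'
    'VERSION:2.0\r\n'
    'BEGIN:VEVENT\r\n'
    'SUMMARY:Release webchanges 3.30.0\r\n'
    'END:VEVENT\r\n'
    'END:VCALENDAR\r\n'
)
parsed_cal = vobject.base.readOne(ics)
for child in parsed_cal.getChildren():
    print(child.name)  # VERSION, then VEVENT
```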
@@ -797,7 +797,14 @@ class FormatJsonFilter(FilterBase):
         try:
             parsed_json = jsonlib.loads(data)
         except jsonlib.JSONDecodeError as e:
-            return
+            return (
+                jsonlib.dumps(
+                    f"ERROR: Filter '{self.__kind__}' returned 'JSONDecodeError: {e}' on the following data:\n\n"
+                    f'{data!s}',
+                    ensure_ascii=False,
+                ),
+                'application/json',
+            )
         if not mime_type.endswith('json'):
             mime_type = 'application/json'
         return jsonlib.dumps(parsed_json, ensure_ascii=False, sort_keys=sort_keys, indent=indentation), mime_type
@@ -908,7 +915,7 @@ class GrepFilter(FilterBase):

         :param data: The data to be filtered (processed).
         :param subfilter: The subfilter information.
-        :returns: The data and MIME type of the data after the filter has been applied.
+        :returns: The data and media type (fka MIME type) of the data after the filter has been applied.
         """
         warnings.warn(
             f"The 'grep' filter is deprecated; replace with 'keep_lines_containing' + 're' subfilter"

@@ -1359,7 +1366,7 @@ class LxmlParser:
         try:
             tree = element.getroottree()
             path = tree.getpath(element)
-            return element is not tree.xpath(path, namespaces=self.namespaces)[0]
+            return element is not tree.xpath(path, namespaces=self.namespaces)[0]  # type: ignore[index]
         except (ValueError, IndexError):
             return True
@@ -1392,11 +1399,22 @@ class LxmlParser:
         excluded_elems: list[etree._Element] | None = None
         try:
             if self.filter_kind == 'css':
-                selected_elems = CSSSelector(self.expression, namespaces=self.namespaces)(
-
+                selected_elems = CSSSelector(self.expression, namespaces=self.namespaces)(
+                    root
+                )  # type: ignore[assignment]
+                excluded_elems = (
+                    CSSSelector(self.exclude, namespaces=self.namespaces)(root)  # type: ignore[assignment]
+                    if self.exclude
+                    else None
+                )
+
             elif self.filter_kind == 'xpath':
-                selected_elems = root.xpath(self.expression, namespaces=self.namespaces)
-                excluded_elems =
+                selected_elems = root.xpath(self.expression, namespaces=self.namespaces)  # type: ignore[assignment]
+                excluded_elems = (
+                    root.xpath(self.exclude, namespaces=self.namespaces)  # type: ignore[assignment]
+                    if self.exclude
+                    else None
+                )
         except (etree.ParserError, etree.XMLSchemaError, etree.XPathError) as e:
             raise ValueError(f'Job {job_index_number} {type(e).__name__}: {e} {self.expression}') from e
         if excluded_elems is not None:
@@ -1545,7 +1563,7 @@ class SortFilter(FilterBase):

         :param data: The data to be filtered (processed).
         :param subfilter: The subfilter information.
-        :returns: The data and MIME type of the data after the filter has been applied.
+        :returns: The data and media type (fka MIME type) of the data after the filter has been applied.
         """
         if not isinstance(data, str):
             raise ValueError
@@ -1808,3 +1826,30 @@ class Base64(FilterBase):
     def filter(self, data: str | bytes, mime_type: str, subfilter: dict[str, Any]) -> tuple[str | bytes, str]:
         data_to_encode = data.encode() if isinstance(data, str) else data
         return base64.b64encode(data_to_encode).decode(), 'text/plain'
+
+
+class JsontoYamlFilter(FilterBase):
+    """Convert JSON to formatted YAML. An alternative to format-json."""
+
+    __kind__ = 'jsontoyaml'
+
+    __supported_subfilters__ = {
+        'indentation': 'Indentation level for pretty-printing',
+    }
+
+    __default_subfilter__ = 'indentation'
+
+    def filter(self, data: str | bytes, mime_type: str, subfilter: dict[str, Any]) -> tuple[str | bytes, str]:
+        self.job.set_to_monospace()
+        indentation = int(subfilter.get('indentation', 2))
+        try:
+            parsed_json = jsonlib.loads(data)
+        except jsonlib.JSONDecodeError as e:
+            return f"Filter '{self.__kind__}' returned JSONDecodeError: {e}\n\n{data!s}", mime_type
+        if isinstance(parsed_json, list):
+            yaml_data = yaml.safe_dump_all(
+                parsed_json, indent=indentation, width=999, allow_unicode=True, line_break='\n'
+            )
+        else:
+            yaml_data = yaml.safe_dump(parsed_json, indent=indentation, width=999, allow_unicode=True, line_break='\n')
+        return yaml_data, 'application/yaml'
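A standalone sketch of the conversion performed by the new `jsontoyaml` filter above, using the same PyYAML calls (`safe_dump` for a single document, `safe_dump_all` for a top-level JSON array); the sample data is made up:

```python
# JSON-to-YAML conversion along the lines of JsontoYamlFilter above.
import json
import yaml

data = '{"name": "webchanges", "features": ["filters", "differs"], "stable": true}'
parsed = json.loads(data)
if isinstance(parsed, list):
    # a top-level JSON array becomes a multi-document YAML stream
    out = yaml.safe_dump_all(parsed, indent=2, width=999, allow_unicode=True)
else:
    out = yaml.safe_dump(parsed, indent=2, width=999, allow_unicode=True)
print(out)
# features:       (safe_dump sorts keys alphabetically by default)
# - filters
# - differs
# name: webchanges
# stable: true
```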
{webchanges-3.29.0 → webchanges-3.30.0}/webchanges/jobs.py

@@ -808,14 +808,14 @@ class UrlJob(UrlJobBase):
         if response.status_code == 304:
             raise NotModifiedError(response.status_code)

-        # Save ETag from response
-        # header
-        # Also save the media-type (MIME type)
-        etag = ''
-        mime_type = ''
+        # Save ETag from response to be used as If-None-Match header in future requests
         if not response.history:  # no redirects
             etag = response.headers.get('ETag', '')
-
+        else:
+            logger.info(f'Job {self.index_number}: ETag not captured as response was redirected to {response.url}')
+            etag = ''
+        # Save the media type (fka MIME type)
+        mime_type = response.headers.get('Content-Type', '').split(';')[0]

         if FilterBase.filter_chain_needs_bytes(self.filters):
             return response.content, etag, mime_type
@@ -913,14 +913,14 @@
         if response.status_code == 304:
             raise NotModifiedError(response.status_code)

-        # Save ETag from response
-        # header
-        # Also save the media-type (MIME type)
-        etag = ''
-        mime_type = ''
+        # Save ETag from response to be used as If-None-Match header in future requests
         if not response.history:  # no redirects
             etag = response.headers.get('ETag', '')
-
+        else:
+            logger.info(f'Job {self.index_number}: ETag not captured as response was redirected to {response.url}')
+            etag = ''
+        # Save the media type (fka MIME type)
+        mime_type = response.headers.get('Content-Type', '').split(';')[0]

         if FilterBase.filter_chain_needs_bytes(self.filters):
             return response.content, etag, mime_type
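Both jobs.py hunks implement the same HTTP caching handshake: the ETag is kept only when the response was not redirected (so it can be replayed as an `If-None-Match` header on the next run), and the media type is derived from `Content-Type`. A hedged sketch of the round trip with the requests library (URL is hypothetical):

```python
# Conditional-request round trip of the kind the hunks above support.
import requests

url = 'https://example.com/data.json'
r = requests.get(url, timeout=60)
etag = r.headers.get('ETag', '') if not r.history else ''  # skip the ETag after redirects
mime_type = r.headers.get('Content-Type', '').split(';')[0]

if etag:
    r2 = requests.get(url, headers={'If-None-Match': etag}, timeout=60)
    if r2.status_code == 304:
        print('Not modified since the last fetch; no new content transferred')
```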
{webchanges-3.29.0 → webchanges-3.30.0/webchanges.egg-info}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: webchanges
-Version: 3.29.0
+Version: 3.30.0
 Summary: Web Changes Delivered. AI-Summarized. Totally Anonymous.
 Author-email: Mike Borsetti <mike+webchanges@borsetti.com>
 Maintainer-email: Mike Borsetti <mike+webchanges@borsetti.com>

@@ -206,8 +206,9 @@ Install **webchanges** with:

 Running in Docker
 -----------------
-**webchanges** can easily run in a
-`here <https://github.com/yubiuser/webchanges-docker>`__
+**webchanges** can easily run in a `Docker <https://www.docker.com/>`__ container! You will find a minimal
+implementation (no browser) `here <https://github.com/yubiuser/webchanges-docker>`__, and one with a browser
+`here <https://github.com/jhedlund/webchanges-docker>`__.


 Documentation |readthedocs|