webchanges 3.29.0__tar.gz → 3.30.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (33) hide show
  1. {webchanges-3.29.0/webchanges.egg-info → webchanges-3.30.0}/PKG-INFO +4 -3
  2. {webchanges-3.29.0 → webchanges-3.30.0}/README.rst +3 -2
  3. {webchanges-3.29.0 → webchanges-3.30.0}/webchanges/__init__.py +1 -1
  4. {webchanges-3.29.0 → webchanges-3.30.0}/webchanges/cli.py +11 -6
  5. {webchanges-3.29.0 → webchanges-3.30.0}/webchanges/config.py +2 -1
  6. {webchanges-3.29.0 → webchanges-3.30.0}/webchanges/differs.py +93 -13
  7. {webchanges-3.29.0 → webchanges-3.30.0}/webchanges/filters.py +74 -29
  8. {webchanges-3.29.0 → webchanges-3.30.0}/webchanges/jobs.py +12 -12
  9. {webchanges-3.29.0 → webchanges-3.30.0/webchanges.egg-info}/PKG-INFO +4 -3
  10. {webchanges-3.29.0 → webchanges-3.30.0}/LICENSE +0 -0
  11. {webchanges-3.29.0 → webchanges-3.30.0}/MANIFEST.in +0 -0
  12. {webchanges-3.29.0 → webchanges-3.30.0}/pyproject.toml +0 -0
  13. {webchanges-3.29.0 → webchanges-3.30.0}/requirements.txt +0 -0
  14. {webchanges-3.29.0 → webchanges-3.30.0}/setup.cfg +0 -0
  15. {webchanges-3.29.0 → webchanges-3.30.0}/webchanges/__main__.py +0 -0
  16. {webchanges-3.29.0 → webchanges-3.30.0}/webchanges/_vendored/__init__.py +0 -0
  17. {webchanges-3.29.0 → webchanges-3.30.0}/webchanges/_vendored/headers.py +0 -0
  18. {webchanges-3.29.0 → webchanges-3.30.0}/webchanges/_vendored/packaging_version.py +0 -0
  19. {webchanges-3.29.0 → webchanges-3.30.0}/webchanges/command.py +0 -0
  20. {webchanges-3.29.0 → webchanges-3.30.0}/webchanges/handler.py +0 -0
  21. {webchanges-3.29.0 → webchanges-3.30.0}/webchanges/mailer.py +0 -0
  22. {webchanges-3.29.0 → webchanges-3.30.0}/webchanges/main.py +0 -0
  23. {webchanges-3.29.0 → webchanges-3.30.0}/webchanges/py.typed +0 -0
  24. {webchanges-3.29.0 → webchanges-3.30.0}/webchanges/reporters.py +0 -0
  25. {webchanges-3.29.0 → webchanges-3.30.0}/webchanges/storage.py +0 -0
  26. {webchanges-3.29.0 → webchanges-3.30.0}/webchanges/storage_minidb.py +0 -0
  27. {webchanges-3.29.0 → webchanges-3.30.0}/webchanges/util.py +0 -0
  28. {webchanges-3.29.0 → webchanges-3.30.0}/webchanges/worker.py +0 -0
  29. {webchanges-3.29.0 → webchanges-3.30.0}/webchanges.egg-info/SOURCES.txt +0 -0
  30. {webchanges-3.29.0 → webchanges-3.30.0}/webchanges.egg-info/dependency_links.txt +0 -0
  31. {webchanges-3.29.0 → webchanges-3.30.0}/webchanges.egg-info/entry_points.txt +0 -0
  32. {webchanges-3.29.0 → webchanges-3.30.0}/webchanges.egg-info/requires.txt +0 -0
  33. {webchanges-3.29.0 → webchanges-3.30.0}/webchanges.egg-info/top_level.txt +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: webchanges
3
- Version: 3.29.0
3
+ Version: 3.30.0
4
4
  Summary: Web Changes Delivered. AI-Summarized. Totally Anonymous.
5
5
  Author-email: Mike Borsetti <mike+webchanges@borsetti.com>
6
6
  Maintainer-email: Mike Borsetti <mike+webchanges@borsetti.com>
@@ -206,8 +206,9 @@ Install **webchanges** with:
206
206
 
207
207
  Running in Docker
208
208
  -----------------
209
- **webchanges** can easily run in a container and you will find a `Docker <https://www.docker.com/>`__ implementation
210
- `here <https://github.com/yubiuser/webchanges-docker>`__.
209
+ **webchanges** can easily run in a `Docker <https://www.docker.com/>`__ container! You will find a minimal
210
+ implementation (no browser) `here <https://github.com/yubiuser/webchanges-docker>`__, and one with a browser
211
+ `here <https://github.com/jhedlund/webchanges-docker>`__.
211
212
 
212
213
 
213
214
  Documentation |readthedocs|
@@ -39,8 +39,9 @@ Install **webchanges** with:
39
39
 
40
40
  Running in Docker
41
41
  -----------------
42
- **webchanges** can easily run in a container and you will find a `Docker <https://www.docker.com/>`__ implementation
43
- `here <https://github.com/yubiuser/webchanges-docker>`__.
42
+ **webchanges** can easily run in a `Docker <https://www.docker.com/>`__ container! You will find a minimal
43
+ implementation (no browser) `here <https://github.com/yubiuser/webchanges-docker>`__, and one with a browser
44
+ `here <https://github.com/jhedlund/webchanges-docker>`__.
44
45
 
45
46
 
46
47
  Documentation |readthedocs|
@@ -22,7 +22,7 @@ __project_name__ = __package__
22
22
  # * MINOR version when you add functionality in a backwards compatible manner, and
23
23
  # * MICRO or PATCH version when you make backwards compatible bug fixes. We no longer use '0'
24
24
  # If unsure on increments, use pkg_resources.parse_version to parse
25
- __version__ = '3.29.0'
25
+ __version__ = '3.30.0'
26
26
  __description__ = (
27
27
  'Check web (or command output) for changes since last run and notify.\n'
28
28
  '\n'
@@ -241,11 +241,16 @@ def first_run(command_config: CommandConfig) -> None:
241
241
  print(f'> Edit it with {__project_name__} --edit')
242
242
 
243
243
 
244
- def load_hooks(hooks_file: Path) -> None:
244
+ def load_hooks(hooks_file: Path, is_default: bool = False) -> None:
245
245
  """Load hooks file."""
246
246
  if not hooks_file.is_file():
247
- # do not use ImportWarning as it could be suppressed
248
- warnings.warn(f'Hooks file not imported because {hooks_file} is not a file', RuntimeWarning)
247
+ if is_default:
248
+ logger.info(f'Hooks file {hooks_file} does not exist or is not a file')
249
+ else:
250
+ # do not use ImportWarning as it could be suppressed
251
+ warnings.warn(
252
+ f'Hooks file {hooks_file} not imported because it does not exist or is not a file', RuntimeWarning
253
+ )
249
254
  return
250
255
 
251
256
  hooks_file_errors = file_ownership_checks(hooks_file)
@@ -258,9 +263,9 @@ def load_hooks(hooks_file: Path) -> None:
258
263
  RuntimeWarning,
259
264
  )
260
265
  else:
261
- logger.info(f'Importing hooks module from {hooks_file}')
266
+ logger.info(f'Importing into hooks module from {hooks_file}')
262
267
  import_module_from_source('hooks', hooks_file)
263
- logger.info('Finished importing hooks module')
268
+ logger.info('Finished importing into hooks module')
264
269
 
265
270
 
266
271
  def handle_unitialized_actions(urlwatch_config: CommandConfig) -> None:
@@ -406,7 +411,7 @@ def main() -> None: # pragma: no cover
406
411
  if command_config.hooks_files:
407
412
  logger.debug(f'Hooks files to be loaded: {command_config.hooks_files}')
408
413
  for hooks_file in command_config.hooks_files:
409
- load_hooks(hooks_file)
414
+ load_hooks(hooks_file, is_default=not command_config.hooks_files_inputted)
410
415
  config_storage.load()
411
416
 
412
417
  # Setup database API
@@ -50,6 +50,7 @@ class CommandConfig(BaseConfig):
50
50
  footnote: str | None
51
51
  gc_database: int | None
52
52
  hooks_files: list[Path]
53
+ hooks_files_inputted: bool
53
54
  install_chrome: bool
54
55
  joblist: Collection[str | int]
55
56
  jobs_files: list[Path]
@@ -89,6 +90,7 @@ class CommandConfig(BaseConfig):
89
90
  super().__init__(config_path, config_file, jobs_def_file, hooks_def_file, ssdb_file)
90
91
  self.parse_args(args)
91
92
  self.jobs_files = self.jobs_files or [jobs_def_file]
93
+ self.hooks_files_inputted = bool(self.hooks_files)
92
94
  self.hooks_files = self.hooks_files or [hooks_def_file]
93
95
 
94
96
  class CustomHelpFormatter(argparse.RawDescriptionHelpFormatter):
@@ -145,7 +147,6 @@ class CommandConfig(BaseConfig):
145
147
  '--jobs',
146
148
  '--urls',
147
149
  action='append',
148
- # default=[self.jobs_def_file],
149
150
  type=Path,
150
151
  help='read job list (URLs/commands) from FILE or files matching a glob pattern',
151
152
  metavar='FILE',
@@ -27,6 +27,7 @@ from typing import Any, Iterator, Literal, TYPE_CHECKING, TypedDict
27
27
  from zoneinfo import ZoneInfo
28
28
 
29
29
  import html2text
30
+ import yaml
30
31
 
31
32
  from webchanges.jobs import JobBase
32
33
  from webchanges.util import linkify, mark_to_html, TrackSubClasses
@@ -64,9 +65,11 @@ except ImportError: # pragma: no cover
64
65
  import json as jsonlib # type: ignore[no-redef]
65
66
 
66
67
  try:
68
+ from xml.parsers.expat import ExpatError
69
+
67
70
  import xmltodict
68
71
  except ImportError as e: # pragma: no cover
69
- xmltodict = str(e) # type: ignore[no-redef]
72
+ xmltodict = str(e) # type: ignore[no-redef,assignment]
70
73
 
71
74
  # https://stackoverflow.com/questions/39740632
72
75
  if TYPE_CHECKING:
@@ -743,7 +746,7 @@ class DeepdiffDiffer(DifferBase):
743
746
  __kind__ = 'deepdiff'
744
747
 
745
748
  __supported_directives__ = {
746
- 'data_type': "either 'json' (default) or 'xml'",
749
+ 'data_type': "either 'json' (default), 'yaml', or 'xml'",
747
750
  'ignore_order': 'Whether to ignore the order in which the items have appeared (default: false)',
748
751
  'ignore_string_case': 'Whether to be case-sensitive or not when comparing strings (default: false)',
749
752
  'significant_digits': (
@@ -859,14 +862,70 @@ class DeepdiffDiffer(DifferBase):
859
862
 
860
863
  return '\n'.join(result)
861
864
 
862
- data_type = directives.get('data_type', 'json')
863
- old_data = ''
864
- new_data = ''
865
- if data_type == 'json':
865
+ if directives.get('data_type'):
866
+ old_data_type = directives['data_type']
867
+ new_data_type = directives['data_type']
868
+ else:
869
+ if self.state.old_mime_type:
870
+ media_subtype = self.state.old_mime_type.split('/')[-1].split('+')[-1].split('x-')[-1]
871
+ if media_subtype in ('yaml', 'yml'):
872
+ old_data_type = 'yaml'
873
+ elif media_subtype == 'xml':
874
+ old_data_type = 'xml'
875
+ elif media_subtype == 'json':
876
+ old_data_type = 'json'
877
+ else:
878
+ logger.info(
879
+ f'Differ {self.__kind__} could not determine data type of old data from media type '
880
+ f"{self.state.old_mime_type}; defaulting to 'json'"
881
+ )
882
+ old_data_type = 'json'
883
+ else:
884
+ logger.info(
885
+ f"Differ {self.__kind__} data_type for old data defaulted to 'json' as media type is missing"
886
+ )
887
+ old_data_type = 'json'
888
+ if self.state.new_mime_type:
889
+ media_subtype = self.state.new_mime_type.split('/')[-1].split('+')[-1].split('x-')[-1]
890
+ if media_subtype in ('yaml', 'yml'):
891
+ new_data_type = 'yaml'
892
+ elif media_subtype == 'xml':
893
+ new_data_type = 'xml'
894
+ elif media_subtype == 'json':
895
+ new_data_type = 'json'
896
+ else:
897
+ logger.info(
898
+ f'Differ {self.__kind__} could not determine data type of new data from media type '
899
+ f"{self.state.new_mime_type}; defaulting to 'json'"
900
+ )
901
+ new_data_type = 'json'
902
+ else:
903
+ logger.info(
904
+ f"Differ {self.__kind__} data_type for new data defaulted to 'json' as media type is missing"
905
+ )
906
+ new_data_type = 'json'
907
+
908
+ old_data: Any = ''
909
+ if old_data_type == 'json':
866
910
  try:
867
911
  old_data = jsonlib.loads(self.state.old_data)
868
912
  except jsonlib.JSONDecodeError:
869
- old_data = ''
913
+ pass
914
+ elif old_data_type == 'yaml':
915
+ try:
916
+ old_data = yaml.safe_load(self.state.old_data)
917
+ except yaml.YAMLError:
918
+ pass
919
+ elif old_data_type == 'xml':
920
+ if isinstance(xmltodict, str): # pragma: no cover
921
+ self.raise_import_error('xmltodict', xmltodict)
922
+ try:
923
+ old_data = xmltodict.parse(self.state.old_data)
924
+ except ExpatError:
925
+ pass
926
+
927
+ new_data: Any = ''
928
+ if new_data_type == 'json':
870
929
  try:
871
930
  new_data = jsonlib.loads(self.state.new_data)
872
931
  except jsonlib.JSONDecodeError as e:
@@ -879,12 +938,34 @@ class DeepdiffDiffer(DifferBase):
879
938
  'markdown': f'Differ {self.__kind__} **ERROR: New data is invalid JSON**\n{e}',
880
939
  'html': f'Differ {self.__kind__} <b>ERROR: New data is invalid JSON</b>\n{e}',
881
940
  }
882
- elif data_type == 'xml':
941
+ elif new_data_type == 'yaml':
942
+ try:
943
+ new_data = yaml.safe_load(self.state.new_data)
944
+ except yaml.YAMLError as e:
945
+ self.state.exception = e
946
+ self.state.traceback = self.job.format_error(e, traceback.format_exc())
947
+ logger.error(f'Job {self.job.index_number}: New data is invalid YAML: {e} ({self.job.get_location()})')
948
+ logger.info(f'Job {self.job.index_number}: {self.state.new_data!r}')
949
+ return {
950
+ 'text': f'Differ {self.__kind__} ERROR: New data is invalid YAML\n{e}',
951
+ 'markdown': f'Differ {self.__kind__} **ERROR: New data is invalid YAML**\n{e}',
952
+ 'html': f'Differ {self.__kind__} <b>ERROR: New data is invalid YAML</b>\n{e}',
953
+ }
954
+ elif new_data_type == 'xml':
883
955
  if isinstance(xmltodict, str): # pragma: no cover
884
956
  self.raise_import_error('xmltodict', xmltodict)
885
-
886
- old_data = xmltodict.parse(self.state.old_data)
887
- new_data = xmltodict.parse(self.state.new_data)
957
+ try:
958
+ new_data = xmltodict.parse(self.state.new_data)
959
+ except ExpatError as e:
960
+ self.state.exception = e
961
+ self.state.traceback = self.job.format_error(e, traceback.format_exc())
962
+ logger.error(f'Job {self.job.index_number}: New data is invalid XML: {e} ({self.job.get_location()})')
963
+ logger.info(f'Job {self.job.index_number}: {self.state.new_data!r}')
964
+ return {
965
+ 'text': f'Differ {self.__kind__} ERROR: New data is invalid XML\n{e}',
966
+ 'markdown': f'Differ {self.__kind__} **ERROR: New data is invalid XML**\n{e}',
967
+ 'html': f'Differ {self.__kind__} <b>ERROR: New data is invalid XML</b>\n{e}',
968
+ }
888
969
 
889
970
  ignore_order: bool = directives.get('ignore_order') # type: ignore[assignment]
890
971
  ignore_string_case: bool = directives.get('ignore_string_case') # type: ignore[assignment]
@@ -1566,7 +1647,7 @@ class AIGoogleDiffer(DifferBase):
1566
1647
  directives_text = ''
1567
1648
  footer = (
1568
1649
  f"Summary by Google Generative AI's model {model_version}{directives_text}"
1569
- if model_version and directives_text
1650
+ if model_version or directives_text
1570
1651
  else ''
1571
1652
  )
1572
1653
  temp_unfiltered_diff: dict[Literal['text', 'markdown', 'html'], str] = {}
@@ -1586,7 +1667,6 @@ class AIGoogleDiffer(DifferBase):
1586
1667
  [
1587
1668
  mark_to_html(summary, extras={'tables'}).replace('<h2>', '<h3>').replace('</h2>', '</h3>'),
1588
1669
  '<br>',
1589
- '<br>',
1590
1670
  unified_report['html'],
1591
1671
  ]
1592
1672
  + (['-----<br>', f'<i><small>{footer}</small></i>'] if footer else [])
@@ -34,9 +34,9 @@ try:
34
34
  from lxml import etree # noqa: S410 insecure use of XML modules, prefer "defusedxml". TODO
35
35
  from lxml.cssselect import CSSSelector # noqa: S410 insecure use of XML ... "defusedxml". TODO
36
36
  except ImportError as e:
37
- from xml import etree
37
+ from xml import etree # type: ignore[no-redef]
38
38
 
39
- CSSSelector = str(e)
39
+ CSSSelector = str(e) # type: ignore[misc,assignment]
40
40
 
41
41
  # https://stackoverflow.com/questions/712791
42
42
  try:
@@ -90,7 +90,7 @@ except ImportError as e: # pragma: has-pytesseract
90
90
  pytesseract = str(e) # type: ignore[assignment]
91
91
 
92
92
  try:
93
- import vobject
93
+ import vobject.base
94
94
  except ImportError as e: # pragma: no cover
95
95
  vobject = str(e) # type: ignore[assignment]
96
96
 
@@ -262,7 +262,7 @@ class FilterBase(metaclass=TrackSubClasses):
262
262
  :param subfilter: The subfilter information.
263
263
  :param job_state: The JobState object (containing the Job).
264
264
  :param data: The data upon which to apply the filter.
265
- :returns: The data and MIME type of the data after the filter has been applied.
265
+ :returns: The data and media type (fka MIME type) of the data after the filter has been applied.
266
266
  """
267
267
  logger.info(f'Job {job_state.job.index_number}: Applying filter {filter_kind}, subfilter(s) {subfilter}')
268
268
  filtercls: type[FilterBase] | None = cls.__subclasses__.get(filter_kind) # type: ignore[assignment]
@@ -308,7 +308,7 @@ class FilterBase(metaclass=TrackSubClasses):
308
308
 
309
309
  :param data: The data to be filtered (processed).
310
310
  :param subfilter: The subfilter information.
311
- :returns: The data and MIME type of the data after the filter has been applied.
311
+ :returns: The data and media type (fka MIME type) of the data after the filter has been applied.
312
312
  """
313
313
  raise NotImplementedError()
314
314
 
@@ -354,7 +354,7 @@ class AutoMatchFilter(FilterBase):
354
354
 
355
355
  :param data: The data to be filtered (processed).
356
356
  :param subfilter: The subfilter information.
357
- :returns: The data and MIME type of the data after the filter has been applied.
357
+ :returns: The data and media type (fka MIME type) of the data after the filter has been applied.
358
358
  """
359
359
  pass
360
360
 
@@ -391,7 +391,7 @@ class RegexMatchFilter(FilterBase):
391
391
 
392
392
  :param data: The data to be filtered (processed).
393
393
  :param subfilter: The subfilter information.
394
- :returns: The data and MIME type of the data after the filter has been applied.
394
+ :returns: The data and media type (fka MIME type) of the data after the filter has been applied.
395
395
  """
396
396
  pass
397
397
 
@@ -414,7 +414,7 @@ class BeautifyFilter(FilterBase):
414
414
 
415
415
  :param data: The data to be filtered (processed).
416
416
  :param subfilter: The subfilter information.
417
- :returns: The data and MIME type of the data after the filter has been applied.
417
+ :returns: The data and media type (fka MIME type) of the data after the filter has been applied.
418
418
  """
419
419
  if isinstance(bs4, str):
420
420
  self.raise_import_error('BeautifulSoup', self.__kind__, bs4)
@@ -462,14 +462,14 @@ class AbsoluteLinksFilter(FilterBase):
462
462
  def filter(self, data: str | bytes, mime_type: str, subfilter: dict[str, Any]) -> tuple[str | bytes, str]:
463
463
  tree = etree.HTML(data)
464
464
  elem: etree._Element
465
- for elem in tree.xpath('//*[@action]'):
466
- elem.attrib['action'] = urljoin(self.job.url, elem.attrib['action'])
467
- for elem in tree.xpath('//object[@data]'):
468
- elem.attrib['data'] = urljoin(self.job.url, elem.attrib['data'])
469
- for elem in tree.xpath('//*[@href]'):
470
- elem.attrib['href'] = urljoin(self.job.url, elem.attrib['href'])
471
- for elem in tree.xpath('//*[@src]'):
472
- elem.attrib['src'] = urljoin(self.job.url, elem.attrib['src'])
465
+ for elem in tree.xpath('//*[@action]'): # type: ignore[assignment,union-attr]
466
+ elem.attrib['action'] = urljoin(self.job.url, elem.attrib['action']) # type: ignore[type-var,assignment]
467
+ for elem in tree.xpath('//object[@data]'): # type: ignore[assignment,union-attr]
468
+ elem.attrib['data'] = urljoin(self.job.url, elem.attrib['data']) # type: ignore[type-var,assignment]
469
+ for elem in tree.xpath('//*[@href]'): # type: ignore[assignment,union-attr]
470
+ elem.attrib['href'] = urljoin(self.job.url, elem.attrib['href']) # type: ignore[type-var,assignment]
471
+ for elem in tree.xpath('//*[@src]'): # type: ignore[assignment,union-attr]
472
+ elem.attrib['src'] = urljoin(self.job.url, elem.attrib['src']) # type: ignore[type-var,assignment]
473
473
  return etree.tostring(tree, encoding='unicode', method='html'), mime_type
474
474
 
475
475
 
@@ -523,7 +523,7 @@ class Html2TextFilter(FilterBase):
523
523
 
524
524
  :param data: The data to be filtered (processed).
525
525
  :param subfilter: The subfilter information.
526
- :returns: The data and MIME type of the data after the filter has been applied.
526
+ :returns: The data and media type (fka MIME type) of the data after the filter has been applied.
527
527
  """
528
528
 
529
529
  # extract method and options from subfilter, defaulting to method html2text
@@ -748,12 +748,12 @@ class Ical2TextFilter(FilterBase):
748
748
 
749
749
  result = []
750
750
  if isinstance(data, str):
751
- parsedCal = vobject.readOne(data)
751
+ parsedCal = vobject.base.readOne(data)
752
752
  else:
753
753
  try:
754
- parsedCal = vobject.readOne(data)
755
- except vobject.ParseError:
756
- parsedCal = vobject.readOne(data.decode(errors='ignore'))
754
+ parsedCal = vobject.base.readOne(data)
755
+ except vobject.base.ParseError:
756
+ parsedCal = vobject.base.readOne(data.decode(errors='ignore'))
757
757
  logger.warning('Found and ignored Unicode-related errors when reading iCal entry.')
758
758
 
759
759
  for event in parsedCal.getChildren():
@@ -797,7 +797,14 @@ class FormatJsonFilter(FilterBase):
797
797
  try:
798
798
  parsed_json = jsonlib.loads(data)
799
799
  except jsonlib.JSONDecodeError as e:
800
- return f"Filter '{self.__kind__}' returned JSONDecodeError: {e}\n\n{data!s}", mime_type
800
+ return (
801
+ jsonlib.dumps(
802
+ f"ERROR: Filter '{self.__kind__}' returned 'JSONDecodeError: {e}' on the following data:\n\n"
803
+ f'{data!s}',
804
+ ensure_ascii=False,
805
+ ),
806
+ 'application/json',
807
+ )
801
808
  if not mime_type.endswith('json'):
802
809
  mime_type = 'application/json'
803
810
  return jsonlib.dumps(parsed_json, ensure_ascii=False, sort_keys=sort_keys, indent=indentation), mime_type
@@ -908,7 +915,7 @@ class GrepFilter(FilterBase):
908
915
 
909
916
  :param data: The data to be filtered (processed).
910
917
  :param subfilter: The subfilter information.
911
- :returns: The data and MIME type of the data after the filter has been applied.
918
+ :returns: The data and media type (fka MIME type) of the data after the filter has been applied.
912
919
  """
913
920
  warnings.warn(
914
921
  f"The 'grep' filter is deprecated; replace with 'keep_lines_containing' + 're' subfilter"
@@ -1359,7 +1366,7 @@ class LxmlParser:
1359
1366
  try:
1360
1367
  tree = element.getroottree()
1361
1368
  path = tree.getpath(element)
1362
- return element is not tree.xpath(path, namespaces=self.namespaces)[0]
1369
+ return element is not tree.xpath(path, namespaces=self.namespaces)[0] # type: ignore[index]
1363
1370
  except (ValueError, IndexError):
1364
1371
  return True
1365
1372
 
@@ -1392,11 +1399,22 @@ class LxmlParser:
1392
1399
  excluded_elems: list[etree._Element] | None = None
1393
1400
  try:
1394
1401
  if self.filter_kind == 'css':
1395
- selected_elems = CSSSelector(self.expression, namespaces=self.namespaces)(root)
1396
- excluded_elems = CSSSelector(self.exclude, namespaces=self.namespaces)(root) if self.exclude else None
1402
+ selected_elems = CSSSelector(self.expression, namespaces=self.namespaces)(
1403
+ root
1404
+ ) # type: ignore[assignment]
1405
+ excluded_elems = (
1406
+ CSSSelector(self.exclude, namespaces=self.namespaces)(root) # type: ignore[assignment]
1407
+ if self.exclude
1408
+ else None
1409
+ )
1410
+
1397
1411
  elif self.filter_kind == 'xpath':
1398
- selected_elems = root.xpath(self.expression, namespaces=self.namespaces)
1399
- excluded_elems = root.xpath(self.exclude, namespaces=self.namespaces) if self.exclude else None
1412
+ selected_elems = root.xpath(self.expression, namespaces=self.namespaces) # type: ignore[assignment]
1413
+ excluded_elems = (
1414
+ root.xpath(self.exclude, namespaces=self.namespaces) # type: ignore[assignment]
1415
+ if self.exclude
1416
+ else None
1417
+ )
1400
1418
  except (etree.ParserError, etree.XMLSchemaError, etree.XPathError) as e:
1401
1419
  raise ValueError(f'Job {job_index_number} {type(e).__name__}: {e} {self.expression}') from e
1402
1420
  if excluded_elems is not None:
@@ -1545,7 +1563,7 @@ class SortFilter(FilterBase):
1545
1563
 
1546
1564
  :param data: The data to be filtered (processed).
1547
1565
  :param subfilter: The subfilter information.
1548
- :returns: The data and MIME type of the data after the filter has been applied.
1566
+ :returns: The data and media type (fka MIME type) of the data after the filter has been applied.
1549
1567
  """
1550
1568
  if not isinstance(data, str):
1551
1569
  raise ValueError
@@ -1808,3 +1826,30 @@ class Base64(FilterBase):
1808
1826
  def filter(self, data: str | bytes, mime_type: str, subfilter: dict[str, Any]) -> tuple[str | bytes, str]:
1809
1827
  data_to_encode = data.encode() if isinstance(data, str) else data
1810
1828
  return base64.b64encode(data_to_encode).decode(), 'text/plain'
1829
+
1830
+
1831
+ class JsontoYamlFilter(FilterBase):
1832
+ """Convert JSON to formatted YAML. An alternative to format-json."""
1833
+
1834
+ __kind__ = 'jsontoyaml'
1835
+
1836
+ __supported_subfilters__ = {
1837
+ 'indentation': 'Indentation level for pretty-printing',
1838
+ }
1839
+
1840
+ __default_subfilter__ = 'indentation'
1841
+
1842
+ def filter(self, data: str | bytes, mime_type: str, subfilter: dict[str, Any]) -> tuple[str | bytes, str]:
1843
+ self.job.set_to_monospace()
1844
+ indentation = int(subfilter.get('indentation', 2))
1845
+ try:
1846
+ parsed_json = jsonlib.loads(data)
1847
+ except jsonlib.JSONDecodeError as e:
1848
+ return f"Filter '{self.__kind__}' returned JSONDecodeError: {e}\n\n{data!s}", mime_type
1849
+ if isinstance(parsed_json, list):
1850
+ yaml_data = yaml.safe_dump_all(
1851
+ parsed_json, indent=indentation, width=999, allow_unicode=True, line_break='\n'
1852
+ )
1853
+ else:
1854
+ yaml_data = yaml.safe_dump(parsed_json, indent=indentation, width=999, allow_unicode=True, line_break='\n')
1855
+ return yaml_data, 'application/yaml'
@@ -808,14 +808,14 @@ class UrlJob(UrlJobBase):
808
808
  if response.status_code == 304:
809
809
  raise NotModifiedError(response.status_code)
810
810
 
811
- # Save ETag from response into job_state; is saved in database and used in future requests in If-None-Match
812
- # header
813
- # Also save the media-type (MIME type)
814
- etag = ''
815
- mime_type = ''
811
+ # Save ETag from response to be used as If-None-Match header in future requests
816
812
  if not response.history: # no redirects
817
813
  etag = response.headers.get('ETag', '')
818
- mime_type = response.headers.get('Content-Type', '').split(';')[0]
814
+ else:
815
+ logger.info(f'Job {self.index_number}: ETag not captured as response was redirected to {response.url}')
816
+ etag = ''
817
+ # Save the media type (fka MIME type)
818
+ mime_type = response.headers.get('Content-Type', '').split(';')[0]
819
819
 
820
820
  if FilterBase.filter_chain_needs_bytes(self.filters):
821
821
  return response.content, etag, mime_type
@@ -913,14 +913,14 @@ class UrlJob(UrlJobBase):
913
913
  if response.status_code == 304:
914
914
  raise NotModifiedError(response.status_code)
915
915
 
916
- # Save ETag from response into job_state; is saved in database and used in future requests in If-None-Match
917
- # header
918
- # Also save the media-type (MIME type)
919
- etag = ''
920
- mime_type = ''
916
+ # Save ETag from response to be used as If-None-Match header in future requests
921
917
  if not response.history: # no redirects
922
918
  etag = response.headers.get('ETag', '')
923
- mime_type = response.headers.get('Content-Type', '').split(';')[0]
919
+ else:
920
+ logger.info(f'Job {self.index_number}: ETag not captured as response was redirected to {response.url}')
921
+ etag = ''
922
+ # Save the media type (fka MIME type)
923
+ mime_type = response.headers.get('Content-Type', '').split(';')[0]
924
924
 
925
925
  if FilterBase.filter_chain_needs_bytes(self.filters):
926
926
  return response.content, etag, mime_type
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: webchanges
3
- Version: 3.29.0
3
+ Version: 3.30.0
4
4
  Summary: Web Changes Delivered. AI-Summarized. Totally Anonymous.
5
5
  Author-email: Mike Borsetti <mike+webchanges@borsetti.com>
6
6
  Maintainer-email: Mike Borsetti <mike+webchanges@borsetti.com>
@@ -206,8 +206,9 @@ Install **webchanges** with:
206
206
 
207
207
  Running in Docker
208
208
  -----------------
209
- **webchanges** can easily run in a container and you will find a `Docker <https://www.docker.com/>`__ implementation
210
- `here <https://github.com/yubiuser/webchanges-docker>`__.
209
+ **webchanges** can easily run in a `Docker <https://www.docker.com/>`__ container! You will find a minimal
210
+ implementation (no browser) `here <https://github.com/yubiuser/webchanges-docker>`__, and one with a browser
211
+ `here <https://github.com/jhedlund/webchanges-docker>`__.
211
212
 
212
213
 
213
214
  Documentation |readthedocs|
File without changes
File without changes
File without changes
File without changes