matterbak 0.4.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
matterbak/__init__.py ADDED
@@ -0,0 +1,6 @@
1
+ """
2
+ main entry point for the matterbak
3
+ """
4
+
5
+ # We intentionally do not export any objects, because the package should only
6
+ # be used as an executable script
@@ -0,0 +1,148 @@
1
+ """
2
+ Provide class ChannelData
3
+ """
4
+
5
+
6
+ import json
7
+ import pathlib as pl
8
+
9
+ from . import dump
10
+
11
# Name of the directory, created below each channel's posts directory, that
# receives the files attached to the posts of that channel.
files_subdir = pl.Path("files")
12
+
13
+
14
class ChannelData:
    # pylint: disable = too-few-public-methods, too-many-instance-attributes
    """Store the data of one channel and back it up to disk.

    The backup of a channel consists of
    * a JSON file with the channel data,
    * a JSON file with the channel members,
    * a JSON file mapping thread root IDs to the IDs of the posts in the thread,
    * a directory with one JSON file per post and
    * a sub directory of the posts directory with the attached files.
    """

    def __init__(self, init, name, channel, channels_dir):
        """Init

        Creates the posts and files directories of the channel (if missing) and
        loads the thread data of a previous backup.

        init: the Init instance; this class uses its attributes matter, users,
            options and rate_limiter
        name: name for the channel data file and its subdir
        channel: channel data, must contain the key 'id'
        channels_dir: pathlib.Path with the dir to store the data in

        raise: KeyError if channel has no key 'id'
        """
        if 'id' not in channel:
            raise KeyError("key 'id' not found in channel")

        self.init = init
        self.name = name
        self.channel = channel
        self.channels_dir = channels_dir
        self._threads_filename = f"{self.name}{dump.FILENAME_SEPARATOR}{dump.SUFFIX_THREADS}"
        self.posts_dir = (self.channels_dir /
                          dump.make_filename(self.channel['id'], name=self.name))
        self.files_dir = self.posts_dir / files_subdir
        self.files_dir.mkdir(parents=True, exist_ok=True)
        self._load_threads()

    def _get_latest_post_id(self):
        """Return latest ID of posts in posts_dir

        This function assumes that the file names begin with a timestamp, such that
        the latest post has the lexicographically highest name.

        return: post ID contained in the JSON file with the max file name, or None
            if there is no such file or if it cannot be parsed
        """
        post_files = [path for path in self.posts_dir.iterdir()
                      if path.suffix.lower() == dump.JSON_EXTENSION and path.is_file()]
        if not post_files:
            return None

        latest_post_file = max(post_files, key=lambda path: path.name)
        try:
            with latest_post_file.open(encoding="utf8") as post_file:
                post = json.load(post_file)
        except ValueError:
            # json.JSONDecodeError and UnicodeDecodeError are both ValueErrors.
            # Without a usable latest ID the caller requests all posts again,
            # which also rewrites the broken file.
            print(f"Cannot parse '{latest_post_file}', "
                  f"all posts of channel '{self.name}' will be checked again")
            return None

        return post.get('id')

    def _load_threads(self):
        """Load thread data from backup

        Sets self._threads to a dict mapping root post IDs to sets of post IDs.
        The dict is empty if there is no thread file or if it has the old format.
        """
        self._threads = {}

        threads_path = (self.channels_dir /
                        dump.make_filename(
                            self.channel['id'],
                            name=self._threads_filename,
                            extension=dump.JSON_EXTENSION))
        if not threads_path.is_file():
            return

        with threads_path.open(encoding="utf8") as threads_file:
            threads_json = json.load(threads_file)

        # Has the file the new format with root_ids as keys?
        # (The old file contained a list of lists.)
        # If not ignore loaded file. It will be overwritten with the new format.
        if isinstance(threads_json, dict):
            self._threads = {root_id: set(post_ids)
                             for root_id, post_ids in threads_json.items()}

    def _save_post(self, post):
        """Backup the files attached to a post

        For each file listed in the post's metadata the file description is
        dumped as JSON and the file content is downloaded into files_dir.

        post: post data; posts without metadata or without files are accepted

        return: number of files that have been downloaded
        """
        # The key 'metadata' may be absent (or null), so do not index it directly.
        metadata = post.get("metadata") or {}

        num_files = 0
        for file_desc in metadata.get("files") or []:
            file_id = file_desc["id"]
            dump.dump_content(self.files_dir, file_desc)
            file_response = self.init.matter.get_file(file_id)
            if not file_response.ok:
                print(
                    f"Cannot retrieve the file '{file_desc['name']}' "
                    f"posted to channel '{self.name}': {file_response.text}")
                continue

            # extension is contained in name
            file_dump_path = (self.files_dir /
                              dump.make_filename(file_id, name=file_desc['name']))
            file_dump_path.write_bytes(file_response.content)
            num_files += 1

        return num_files

    def _update_threads(self, post):
        """Update thread data with new post

        Posts with an empty root_id do not belong to a thread and are ignored.
        """
        root_id = post['root_id']
        if root_id:
            self._threads.setdefault(root_id, set()).add(post['id'])

    def backup(self):
        """Download channel data and all its posts and files

        Unless the option update_old_posts is set, only posts after the latest
        post found in posts_dir are requested.

        return: tuple (number of new or updated posts, number of downloaded files)
        """
        channel_id = self.channel['id']

        dump.dump_content(self.channels_dir, self.channel, name=self.name)

        members = self.init.users.get_group_members(self.channel)
        dump.dump_content(
            self.channels_dir, members, id_=channel_id,
            name=f"{self.name}{dump.FILENAME_SEPARATOR}{dump.SUFFIX_MEMBERS}")

        latest_id = (None if self.init.options.update_old_posts
                     else self._get_latest_post_id())

        num_posts = 0
        num_files = 0
        for post in self.init.matter.get_posts_for_channel(channel_id, after=latest_id):
            self.init.rate_limiter.wait()
            old_content = dump.dump_content(
                self.posts_dir, post, with_timestamp=True, return_old_content=True)

            # Files are only downloaded for posts that are new or have changed.
            is_new_or_updated = (
                (not old_content) or (old_content['update_at'] < post['update_at']))
            if is_new_or_updated:
                num_posts += 1
                num_files += self._save_post(post)

            # We update the threads in any case although thread relations cannot be changed
            # because this will update the thread file format.
            self._update_threads(post)

            print('+' if is_new_or_updated else '.', end='', flush=True)

        # Newline after progress dots
        print()

        # Sets have no stable order; sort the IDs so that the thread file does
        # not change between runs when the threads have not changed.
        threads_json = {root_id: sorted(post_ids)
                        for root_id, post_ids in self._threads.items()}
        dump.dump_content(self.channels_dir, threads_json,
                          id_=channel_id, name=self._threads_filename)

        return num_posts, num_files
matterbak/dump.py ADDED
@@ -0,0 +1,129 @@
1
+ """
2
+ Provide functions to dump data into JSON files
3
+ """
4
+
5
+
6
+ import datetime
7
+ import json
8
+ import pathlib as pl
9
+
10
+ from .ignoresignals import IgnoreSignals
11
+
12
# Extension (including the dot) of all JSON files written by this module
JSON_EXTENSION = ".json"
# String that joins the parts (timestamp, ID, name) of a filename
FILENAME_SEPARATOR = "__"
# strftime format of the timestamp that can prefix a filename
TIMESTAMP_FORMAT = "%Y%m%d-%H%M%S%f"

# Directories below data_dir, one per kind of downloaded data
teams_subdir = pl.Path("teams")
groups_subdir = pl.Path("groups")
direct_subdir = pl.Path("direct")
emojis_subdir = pl.Path("emojis")
users_subdir = pl.Path("users")
files_subdir = pl.Path("files")

# Name suffixes that tell the different kinds of data files apart
SUFFIX_MEMBERS = "members"
SUFFIX_THREADS = "threads"
SUFFIX_ICON = "icon"
SUFFIX_IMAGE = "image"
31
+
32
+
33
def make_filename(id_, name=None, extension='', mm_timestamp=None):
    """Build the filename of a backup file

    The filename consists of the optional timestamp, the ID and the optional
    name, joined by FILENAME_SEPARATOR and followed by the extension.

    id_: Mattermost ID to insert into the filename
    name: optional name to append
    extension: optional extension for the filename
    mm_timestamp: optional Mattermost timestamp (Unix time in milliseconds)

    return: filename
    """
    parts = [id_]

    if mm_timestamp:
        # Mattermost counts milliseconds, datetime expects seconds.
        # NOTE: without a tz argument fromtimestamp() yields local time, so the
        # prefix depends on the time zone of the machine running the backup.
        moment = datetime.datetime.fromtimestamp(mm_timestamp / 1000)
        parts.insert(0, moment.strftime(TIMESTAMP_FORMAT))

    if name:
        parts.append(name)

    stem = FILENAME_SEPARATOR.join(parts)
    return stem + extension
52
+
53
+
54
def dump_image(directory, id_, image_loader, label=None, skip_existing=False):
    """Helper to download and save an image from Mattermost

    Calls make_filename with id_, label as name, and extension derived from the
    content type returned from Mattermost.

    Image files already stored for id_ are only replaced after the new image
    has been downloaded successfully. If the download fails or does not
    deliver an image, the stored files are kept.

    directory: pathlib.Path of the folder to store the image in
    id_: Mattermost ID as prefix for the filename
    image_loader: function returning an image Response object from Mattermost API
    label: label to append to filename
    skip_existing: if True skip download if image file already exists
    """

    def find_image_files():
        # All files starting with the ID which are not JSON data files
        return [f for f in directory.glob(id_ + '*')
                if f.suffix != JSON_EXTENSION and f.is_file()]

    if skip_existing and find_image_files():
        return

    response = image_loader()
    if not response.ok:
        return

    content_type_prefix = 'image/'
    content_type = response.headers.get('content-type', '')
    # The header may carry parameters ("image/png; charset=binary") which must
    # not end up in the file extension.
    media_type = content_type.split(';', 1)[0].strip()
    if not media_type.startswith(content_type_prefix):
        print(f"Cannot store image of type '{content_type}' for ID {id_}")
        return
    extension = '.' + media_type.removeprefix(content_type_prefix)

    path = (directory /
            make_filename(id_=id_, name=label, extension=extension))

    with IgnoreSignals():
        # The new image file may have a different extension so delete all
        # other existing image files. The file at path is simply overwritten.
        for image in find_image_files():
            if image != path:
                image.unlink(missing_ok=True)
        path.write_bytes(response.content)
92
+
93
+
94
def dump_content(directory, content, id_=None, name=None, with_timestamp=False,
                 return_old_content=False):
    # pylint: disable = too-many-arguments, too-many-positional-arguments
    """Helper to save the content as JSON file

    Calls make_filename with id_ (if given else content['id']), name, and
    with_timestamp to create the filename. An existing file is overwritten.

    directory: pathlib.Path of the folder to store the file in
    content: data to store
    id_: Mattermost ID to be integrated into filename, if None use
        content['id'] instead
    name: name (without .json extension) of the file, can be empty
    with_timestamp: set to True to prefix filename with content's creation time
        (content['create_at'])
    return_old_content: if True content of file to be overwritten is returned

    return: the old content if return_old_content is True and a parsable file
        existed, else None
    """

    if not id_:
        id_ = content['id']
    mm_timestamp = content["create_at"] if with_timestamp else None

    path = (directory /
            make_filename(id_, name=name, extension=JSON_EXTENSION,
                          mm_timestamp=mm_timestamp))

    old_content = None
    if return_old_content and path.is_file():
        try:
            with path.open(encoding="utf8") as old_file:
                old_content = json.load(old_file)
        except ValueError:
            # json.JSONDecodeError and UnicodeDecodeError are both ValueErrors.
            # A broken file is overwritten below anyway, so treat it like a
            # missing file instead of aborting the dump.
            old_content = None

    # Do not get interrupted while the file is only partially written
    with IgnoreSignals():
        with path.open(mode="w", encoding="utf8") as dump_file:
            json.dump(content, dump_file)

    return old_content
@@ -0,0 +1,16 @@
1
+ """
2
+ Provide class HashableMatterData
3
+ """
4
+
5
+
6
class HashableMatterData(dict):
    """Extends the dict of a mattermost object by hash method to enable storing in a set

    This class can be initialized with the original dict of the mattermost object.

    Two objects are equal if their values for the key 'id' are equal, the
    remaining content is not compared.
    """

    def __eq__(self, other):
        """Compare by the value of 'id' only

        return: NotImplemented if other has no 'id' item, so that Python falls
            back to the comparison of other and finally to "not equal"

        raise: KeyError if this object itself has no key 'id'
        """
        own_id = self['id']
        try:
            other_id = other['id']
        except (KeyError, IndexError, TypeError):
            return NotImplemented
        return own_id == other_id

    def __ne__(self, other):
        """Inverse of __eq__

        This has to be defined explicitly: otherwise dict.__ne__ is inherited,
        which compares the whole content and so contradicts __eq__.
        """
        equal = self.__eq__(other)
        if equal is NotImplemented:
            return NotImplemented
        return not equal

    def __hash__(self):
        """Hash of the value of 'id', consistent with __eq__"""
        return hash(self['id'])
@@ -0,0 +1,106 @@
1
+ """
2
+ A simple class to temporarily ignore specific signals (e.g., SIGINT, SIGTERM)
3
+ during critical operations like file writing.
4
+ """
5
+
6
+ import os
7
+ import signal
8
+
9
+
10
class IgnoreSignals():
    """Context manager to temporarily ignore specified signals"""

    def __init__(
            self,
            signals=None,
            print_message_on_signal=None,
            delay_signals=True):
        """
        Temporarily ignore specified signals (e.g., Ctrl+C, kill) during
        critical operations.

        Example:

        >>> with IgnoreSignals([signal.SIGINT, signal.SIGTERM]):
        ...     # do critical work like file writing

        The handlers which are active when the instance is created are stored
        and restored when the context is left.

        signals (list): List of signal numbers to ignore
            (default: [signal.SIGINT, signal.SIGTERM]).
            Use `None` to accept the default.
        print_message_on_signal (str or callable or None):
            message to be printed on signal
            * If None (default) the f-string
              f'ignoring signal {signum} until write is finished'
              is used.
            * If a callable (e.g. lambda function):
              called with (signum, frame). This function could print other
              messages built with input.
            * If bool(print_message_on_signal) is True
              the variable print_message_on_signal is printed.
            * Otherwise (e. g. False or '') no output.
        delay_signals:
            If True the ignored signal is sent again just after the original
            handlers have been restored.
            Only the last ignored signal is re-raised.
            Attempting to re-raise multiple signals would risk unpredictable
            behavior and is intentionally omitted.
        """
        if signals is None:
            signals = [signal.SIGINT, signal.SIGTERM]
        self.signals = signals
        self.print_message_on_signal = print_message_on_signal
        self.delay_signals = delay_signals
        # Number of the last signal received while ignoring, None if there was none
        self.ignored_signum = None
        # List of [signal number, handler] pairs to restore in revert()
        self.default_handlers = []
        self.update_default_handlers()

    def __enter__(self):
        """Enter 'with' context"""
        self.ignore()
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        """Leave 'with' context, exceptions are not suppressed"""
        self.revert()

    def update_default_handlers(self):
        """
        store actual signal handlers as default handlers
        """
        self.default_handlers = [
            [sig, signal.getsignal(sig)] for sig in self.signals]

    def ignore(self):
        """
        set signal handlers to ignore the signals

        Signals whose stored handler is None are left untouched:
        signal.getsignal() returns None for handlers that were not installed
        from Python, and such a handler could not be restored afterwards.
        """
        not_restorable = {
            sig for (sig, handler) in self.default_handlers if handler is None}
        for sig in self.signals:
            if sig in not_restorable:
                continue
            signal.signal(sig, self.ignoring_handler)

    def revert(self):
        """
        revert/restore to original/default signal handlers

        More precisely the signal handlers defined during
        class instance creation
        or calling the method `update_default_handlers`

        Afterwards the last ignored signal is sent to this process again if
        delay_signals is True.
        """
        for (sig, handler) in self.default_handlers:
            # signal.signal() raises TypeError for None, see ignore()
            if handler is not None:
                signal.signal(sig, handler)

        # Forget the signal before it is sent again: the restored handler may
        # raise (e.g. KeyboardInterrupt) and must not leave stale state behind.
        pending_signum = self.ignored_signum
        self.ignored_signum = None
        if self.delay_signals and (pending_signum is not None):
            os.kill(os.getpid(), pending_signum)

    def ignoring_handler(self, signum, frame):
        """
        handler that ignores the signal and optionally prints a message
        or calls a function
        """
        self.ignored_signum = signum

        message = self.print_message_on_signal
        if message is None:
            print(f'ignoring signal {signum} until write is finished')
        elif callable(message):
            message(signum, frame)
        elif message:
            print(message)