redacted-py 1.0.4__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,17 @@
1
+ Copyright (c) 2024 Cyril Dever
2
+
3
+ Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
4
+
5
+ 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
6
+ 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
7
+
8
+ Subject to the terms and conditions of this license, each copyright holder and contributor hereby grants to those receiving rights under this license a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable (except for failure to satisfy the conditions of this license) patent license to make, have made, use, offer to sell, sell, import, and otherwise transfer this software, where such license applies only to those patent claims, already acquired or hereafter acquired, licensable by such copyright holder or contributor that are necessarily infringed by:
9
+
10
+ (a) their Contribution(s) (the licensed copyrights of copyright holders and non-copyrightable additions of contributors, in source or binary form) alone; or
11
+ (b) combination of their Contribution(s) with the work of authorship to which such Contribution(s) was added by such copyright holder or contributor, if, at the time the Contribution is added, such addition causes such combination to be necessarily infringed. The patent license shall not apply to any other combinations which include the Contribution.
12
+
13
+ Except as expressly stated above, no rights or licenses from any copyright holder or contributor is granted under this license, whether expressly, by implication, estoppel or otherwise.
14
+
15
+ DISCLAIMER
16
+
17
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
@@ -0,0 +1,3 @@
1
+ # MANIFEST.in
2
+
3
+ exclude publish.sh
@@ -0,0 +1,123 @@
1
+ Metadata-Version: 2.1
2
+ Name: redacted-py
3
+ Version: 1.0.4
4
+ Summary: Redacting classified documents
5
+ Author-email: Cyril Dever <cdever@pep-s.com>
6
+ License: Copyright (c) 2024 Cyril Dever
7
+
8
+ Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
9
+
10
+ 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
11
+ 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
12
+
13
+ Subject to the terms and conditions of this license, each copyright holder and contributor hereby grants to those receiving rights under this license a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable (except for failure to satisfy the conditions of this license) patent license to make, have made, use, offer to sell, sell, import, and otherwise transfer this software, where such license applies only to those patent claims, already acquired or hereafter acquired, licensable by such copyright holder or contributor that are necessarily infringed by:
14
+
15
+ (a) their Contribution(s) (the licensed copyrights of copyright holders and non-copyrightable additions of contributors, in source or binary form) alone; or
16
+ (b) combination of their Contribution(s) with the work of authorship to which such Contribution(s) was added by such copyright holder or contributor, if, at the time the Contribution is added, such addition causes such combination to be necessarily infringed. The patent license shall not apply to any other combinations which include the Contribution.
17
+
18
+ Except as expressly stated above, no rights or licenses from any copyright holder or contributor is granted under this license, whether expressly, by implication, estoppel or otherwise.
19
+
20
+ DISCLAIMER
21
+
22
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
23
+
24
+ Project-URL: Homepage, https://github.com/cyrildever/redacted
25
+ Keywords: data,obfuscation,data masking,redacted,classified
26
+ Classifier: Programming Language :: Python :: 3
27
+ Classifier: License :: OSI Approved :: BSD License
28
+ Classifier: Operating System :: OS Independent
29
+ Requires-Python: >=3.10.2
30
+ Description-Content-Type: text/markdown
31
+ License-File: LICENSE
32
+ Requires-Dist: feistel-py>=0.2.0
33
+
34
+ # redacted-py
35
+ _Redacting classified documents_
36
+
37
+ ![GitHub tag (latest by date)](https://img.shields.io/github/v/tag/cyrildever/redacted)
38
+ ![GitHub last commit](https://img.shields.io/github/last-commit/cyrildever/redacted)
39
+ ![GitHub issues](https://img.shields.io/github/issues/cyrildever/redacted)
40
+ ![GitHub license](https://img.shields.io/github/license/cyrildever/redacted)
41
+ ![PyPI - Version](https://img.shields.io/pypi/v/redacted-py)
42
+
43
+ This repository holds the code base for my `redacted-py` library in Python. \
44
+ It is mainly based off my [Feistel cipher for Format-Preserving Encryption](https://github.com/cyrildever/feistel-py) to which I added a few tools to handle document, database and file manipulation to ease out the operation.
45
+
46
+ ### Motivation
47
+
48
+ In some fields (healthcare, for instance), protecting the privacy of data while still being able to conduct in-depth studies is both vital and mandatory. Redacting documents and databases is therefore a necessary step.
49
+ With `redacted-py`, I provide a simple yet secure tool to help redact documents, using either a dictionary, a record layout or a tag to decide which parts should actually be redacted.
50
+
51
+
52
+ ### Usage
53
+
54
+ You can use either a dictionary or a tag (or both) to identify the words you want to redact in a document.
55
+ The tag should be placed before any word that should be redacted. The default tag is the tilde character (`~`).
56
+
57
+ For example, the following sentence will only see the word `tagged` redacted: `"This is a ~tagged sentence"`.
58
+
59
+ ```console
60
+ $ pip install redacted-py
61
+ ```
62
+
63
+ ```python
64
+ from redacted import DefaultRedactor, Dictionary
65
+ from feistel import FPECipher, SHA_256
66
+
67
+ source = "Some text ~tagged or using words in a dictionary"
68
+
69
+ cipher = FPECipher(SHA_256, key, 10)  # `key` is your own secret key, e.g. a 64-character hex string
70
+ redactor = DefaultRedactor(cipher)
71
+ redacted = redactor.redact(source)
72
+
73
+ expanded = redactor.expand(redacted)
74
+ assert expanded == source, "Original data should equal ciphered then deciphered data"
75
+
76
+ cleansed = redactor.clean(expanded)
77
+ assert cleansed == "Some text tagged or using words in a dictionary", "Cleaning should remove any tag mark"
78
+ ```
79
+
80
+ You may also use it in the console with the following command line instructions:
81
+ ```
82
+ usage: python3 -m redacted [-h] [-b | --both | --no-both] [-d DICTIONARY] [-H HASH] [-i INPUT] [-k KEY] [-o OUTPUT] [-r ROUNDS] [-t TAG] [-x | --expand | --no-expand]
83
+
84
+ options:
85
+ -h, --help show this help message and exit
86
+ -b, --both, --no-both
87
+ Add to use both dictionary and tag
88
+ -d DICTIONARY, --dictionary DICTIONARY
89
+ The optional path to the dictionary of words to redact
90
+ -H HASH, --hash HASH The hash engine for the round function [default sha-256]
91
+ -i INPUT, --input INPUT
92
+ The path to the document to be redacted
93
+ -k KEY, --key KEY The optional key for the FPE scheme (leave it empty to use default)
94
+ -o OUTPUT, --output OUTPUT
95
+ The name of the output file
96
+ -r ROUNDS, --rounds ROUNDS
97
+ The number of rounds for the Feistel cipher [default 10]
98
+ -t TAG, --tag TAG The optional tag that prefixes words to redact [default ~]
99
+ -x, --expand, --no-expand
100
+ Add to expand a redacted document
101
+ ```
102
+
103
+
104
+ ### Tests
105
+
106
+ ```console
107
+ $ git clone https://github.com/cyrildever/redacted.git
108
+ $ cd redacted/py/
109
+ $ pip install -e .
110
+ $ python3 -m unittest discover
111
+ ```
112
+
113
+
114
+ ### License
115
+
116
+ Use of the `redacted` libraries and executables is subject to fees for commercial purposes and to compliance with the [BSD-2-Clause-Patent license](LICENSE). \
117
+ Please [contact me](mailto:cdever@pep-s.com) to get further information.
118
+
119
+ _NB: It is still under development so use in production at your own risk for now._
120
+
121
+
122
+ <hr />
123
+ &copy; 2024 Cyril Dever. All rights reserved.
@@ -0,0 +1,90 @@
1
+ # redacted-py
2
+ _Redacting classified documents_
3
+
4
+ ![GitHub tag (latest by date)](https://img.shields.io/github/v/tag/cyrildever/redacted)
5
+ ![GitHub last commit](https://img.shields.io/github/last-commit/cyrildever/redacted)
6
+ ![GitHub issues](https://img.shields.io/github/issues/cyrildever/redacted)
7
+ ![GitHub license](https://img.shields.io/github/license/cyrildever/redacted)
8
+ ![PyPI - Version](https://img.shields.io/pypi/v/redacted-py)
9
+
10
+ This repository holds the code base for my `redacted-py` library in Python. \
11
+ It is mainly based off my [Feistel cipher for Format-Preserving Encryption](https://github.com/cyrildever/feistel-py) to which I added a few tools to handle document, database and file manipulation to ease out the operation.
12
+
13
+ ### Motivation
14
+
15
+ In some fields (healthcare, for instance), protecting the privacy of data while still being able to conduct in-depth studies is both vital and mandatory. Redacting documents and databases is therefore a necessary step.
16
+ With `redacted-py`, I provide a simple yet secure tool to help redact documents, using either a dictionary, a record layout or a tag to decide which parts should actually be redacted.
17
+
18
+
19
+ ### Usage
20
+
21
+ You can use either a dictionary or a tag (or both) to identify the words you want to redact in a document.
22
+ The tag should be placed before any word that should be redacted. The default tag is the tilde character (`~`).
23
+
24
+ For example, the following sentence will only see the word `tagged` redacted: `"This is a ~tagged sentence"`.
25
+
26
+ ```console
27
+ $ pip install redacted-py
28
+ ```
29
+
30
+ ```python
31
+ from redacted import DefaultRedactor, Dictionary
32
+ from feistel import FPECipher, SHA_256
33
+
34
+ source = "Some text ~tagged or using words in a dictionary"
35
+
36
+ cipher = FPECipher(SHA_256, key, 10)  # `key` is your own secret key, e.g. a 64-character hex string
37
+ redactor = DefaultRedactor(cipher)
38
+ redacted = redactor.redact(source)
39
+
40
+ expanded = redactor.expand(redacted)
41
+ assert expanded == source, "Original data should equal ciphered then deciphered data"
42
+
43
+ cleansed = redactor.clean(expanded)
44
+ assert cleansed == "Some text tagged or using words in a dictionary", "Cleaning should remove any tag mark"
45
+ ```
46
+
47
+ You may also use it in the console with the following command line instructions:
48
+ ```
49
+ usage: python3 -m redacted [-h] [-b | --both | --no-both] [-d DICTIONARY] [-H HASH] [-i INPUT] [-k KEY] [-o OUTPUT] [-r ROUNDS] [-t TAG] [-x | --expand | --no-expand]
50
+
51
+ options:
52
+ -h, --help show this help message and exit
53
+ -b, --both, --no-both
54
+ Add to use both dictionary and tag
55
+ -d DICTIONARY, --dictionary DICTIONARY
56
+ The optional path to the dictionary of words to redact
57
+ -H HASH, --hash HASH The hash engine for the round function [default sha-256]
58
+ -i INPUT, --input INPUT
59
+ The path to the document to be redacted
60
+ -k KEY, --key KEY The optional key for the FPE scheme (leave it empty to use default)
61
+ -o OUTPUT, --output OUTPUT
62
+ The name of the output file
63
+ -r ROUNDS, --rounds ROUNDS
64
+ The number of rounds for the Feistel cipher [default 10]
65
+ -t TAG, --tag TAG The optional tag that prefixes words to redact [default ~]
66
+ -x, --expand, --no-expand
67
+ Add to expand a redacted document
68
+ ```
69
+
70
+
71
+ ### Tests
72
+
73
+ ```console
74
+ $ git clone https://github.com/cyrildever/redacted.git
75
+ $ cd redacted/py/
76
+ $ pip install -e .
77
+ $ python3 -m unittest discover
78
+ ```
79
+
80
+
81
+ ### License
82
+
83
+ Use of the `redacted` libraries and executables is subject to fees for commercial purposes and to compliance with the [BSD-2-Clause-Patent license](LICENSE). \
84
+ Please [contact me](mailto:cdever@pep-s.com) to get further information.
85
+
86
+ _NB: It is still under development so use in production at your own risk for now._
87
+
88
+
89
+ <hr />
90
+ &copy; 2024 Cyril Dever. All rights reserved.
@@ -0,0 +1,35 @@
1
+ # pyproject.toml
2
+
3
+ [build-system]
4
+ requires = ["setuptools>=58.1.0", "wheel"]
5
+ build-backend = "setuptools.build_meta"
6
+
7
+ [project]
8
+ name = "redacted-py"
9
+ version = "1.0.4"
10
+ description = "Redacting classified documents"
11
+ readme = "README.md"
12
+ authors = [{ name = "Cyril Dever", email = "cdever@pep-s.com" }]
13
+ license = { file = "LICENSE" }
14
+ classifiers = [
15
+ "Programming Language :: Python :: 3",
16
+ "License :: OSI Approved :: BSD License",
17
+ "Operating System :: OS Independent",
18
+ ]
19
+ keywords = [
20
+ "data",
21
+ "obfuscation",
22
+ "data masking",
23
+ "redacted",
24
+ "classified",
25
+ ]
26
+ dependencies = [
27
+ "feistel-py >= 0.2.0",
28
+ ]
29
+ requires-python = ">=3.10.2"
30
+
31
+ [project.urls]
32
+ Homepage = "https://github.com/cyrildever/redacted"
33
+
34
+ [project.scripts]
35
+ redacted-py = "redacted.__main__:main"
@@ -0,0 +1,4 @@
1
+ [egg_info]
2
+ tag_build =
3
+ tag_date = 0
4
+
@@ -0,0 +1,7 @@
1
+ # __init__.py
2
+
3
+ __version__ = "1.0.4"
4
+
5
+ from .tag import *
6
+ from .dictionary import *
7
+ from .redactor import *
@@ -0,0 +1,114 @@
1
+ import argparse
2
+ from feistel import FPECipher, Engine, is_available_engine, SHA_256
3
+
4
+
5
+ from redacted import (
6
+ DEFAULT_TAG,
7
+ file2Dictionary,
8
+ Redactor,
9
+ RedactorWithDictionary,
10
+ RedactorWithTag,
11
+ )
12
+
13
+
14
+ DEFAULT_KEY = "d51e1d9a9b12cd88a1d232c1b8730a05c8a65d9706f30cdb8e08b9ed4c7b16a0"
15
+ DEFAULT_ROUNDS = 10
16
+
17
+
18
+ def main(args):
19
+ if not args.input or not args.output:
20
+ raise Exception("Input and output file paths are mandatory")
21
+ if not args.tag and not args.dictionary:
22
+ raise Exception("Use to set either a tag or a dictionary")
23
+ tag = args.tag
24
+ if args.both:
25
+ if not args.dictionary:
26
+ raise Exception(
27
+ "Tag and dictionary must be set if you want to use them both"
28
+ )
29
+ elif not args.tag:
30
+ print("WARN - Tag not set: default ~ will be used!")
31
+ tag = DEFAULT_TAG
32
+
33
+ hash_engine = Engine(args.hash)
34
+ if not args.hash and not is_available_engine(hash_engine):
35
+ print("WARN - Wrong hash engine: default SHA-256 will be used instead!")
36
+ hash_engine = SHA_256
37
+ key = args.key
38
+ if not key:
39
+ key = DEFAULT_KEY
40
+ rounds = int(args.rounds) if args.rounds else 0
41
+ if rounds < 2:
42
+ print("WARN - Not enough rounds: default 10 will be used instead!")
43
+ rounds = DEFAULT_ROUNDS
44
+
45
+ msg = "Start redacting..."
46
+ if args.expand:
47
+ msg = "Start expanding..."
48
+ print(f"INFO - {msg}")
49
+
50
+ # Prepare processing
51
+ if args.dictionary:
52
+ dic = file2Dictionary(args.dictionary)
53
+
54
+ cipher = FPECipher(hash_engine, key, rounds)
55
+ if args.both:
56
+ redactor = Redactor(dictionary=dic, tag=tag, cipher=cipher, both=True)
57
+ elif not args.expand and not dic.is_empty():
58
+ redactor = RedactorWithDictionary(dictionary=dic, cipher=cipher)
59
+ else:
60
+ redactor = RedactorWithTag(tag=tag, cipher=cipher)
61
+
62
+ # Do process
63
+ with open(args.input, "r") as inputfile, open(args.output, "w") as outputfile:
64
+ for line in inputfile:
65
+ if not args.expand:
66
+ redacted_line = redactor.redact(line)
67
+ outputfile.write(redacted_line + "\n")
68
+ else:
69
+ expanded_line = redactor.expand(line)
70
+ outputfile.write(expanded_line + "\n")
71
+
72
+ print("INFO - Process completed.")
73
+
74
+
75
+ if __name__ == "__main__":
76
+ parser = argparse.ArgumentParser()
77
+ parser.add_argument(
78
+ "-b",
79
+ "--both",
80
+ action=argparse.BooleanOptionalAction,
81
+ help="Add to use both dictionary and tag",
82
+ )
83
+ parser.add_argument(
84
+ "-d",
85
+ "--dictionary",
86
+ help="The optional path to the dictionary of words to redact",
87
+ )
88
+ parser.add_argument(
89
+ "-H", "--hash", help="The hash engine for the round function [default sha-256]"
90
+ )
91
+ parser.add_argument("-i", "--input", help="The path to the document to be redacted")
92
+ parser.add_argument(
93
+ "-k",
94
+ "--key",
95
+ help="The optional key for the FPE scheme (leave it empty to use default)",
96
+ )
97
+ parser.add_argument("-o", "--output", help="The name of the output file")
98
+ parser.add_argument(
99
+ "-r",
100
+ "--rounds",
101
+ help="The number of rounds for the Feistel cipher [default 10]",
102
+ )
103
+ parser.add_argument(
104
+ "-t", "--tag", help="The optional tag that prefixes words to redact [default ~]"
105
+ )
106
+ parser.add_argument(
107
+ "-x",
108
+ "--expand",
109
+ action=argparse.BooleanOptionalAction,
110
+ help="Add to expand a redacted document",
111
+ )
112
+ args = parser.parse_args()
113
+
114
+ main(args)
@@ -0,0 +1,83 @@
1
class Dictionary:
    """
    A Dictionary holds the list of words to be tagged
    """

    def __init__(self, words: list[str]):
        # The list is stored as passed (no copy is made).
        self.words = words

    def contains(self, word: str) -> bool:
        """
        Check whether the passed word is already in the Dictionary

        The word matches as is, without its trailing possessive `'s`, or once
        stripped of surrounding whitespace and punctuation.
        """
        if word in self.words:
            return True
        # Possessive form, e.g. "Dever's" matches "Dever"
        if word.endswith("'s") and word[:-2] in self.words:
            return True
        return _remove_punctuation(word.strip()) in self.words

    def is_empty(self) -> bool:
        """
        Returns `True` if there is no word in the Dictionary
        """
        return len(self.words) == 0

    def length(self) -> int:
        """
        Gets the number of words
        """
        return len(self.words)

    def to_string(self) -> str:
        """
        Returns the dictionary as a space-separated list of words
        """
        return " ".join(self.words)

    def __eq__(self, other) -> bool:
        # Let Python fall back to its default comparison for foreign types
        # instead of failing on a missing `words` attribute.
        if not isinstance(other, Dictionary):
            return NotImplemented
        return self.words == other.words
39
+
40
+
41
def file2Dictionary(path: str) -> Dictionary:
    """
    Upload the content of a file to a Dictionary

    Words may be separated by any whitespace (spaces, tabs or new lines), so a
    trailing new line or a one-word-per-line file does not produce unusable
    entries. The file is read as UTF-8.
    """
    with open(path, "r", encoding="utf-8") as f:
        data = f.read()

    # `str.split()` without argument splits on runs of whitespace and never
    # returns empty strings.
    return Dictionary(data.split())
49
+
50
+
51
def string2Dictionary(string: str, *delimiters) -> Dictionary:
    """
    Transforms a string into a dictionary (using the optionally passed delimiters [default space])

    Empty fragments, e.g. those produced by two consecutive delimiters, are
    dropped so that the empty string never becomes a word of the Dictionary.
    """
    if len(delimiters) == 0:
        delimiters = (" ",)

    # Split the fragments again for each delimiter in turn
    words = [string]
    for delimiter in delimiters:
        words = [part for word in words for part in word.split(delimiter)]

    return Dictionary([word for word in words if word])
66
+
67
+
68
+ def _remove_punctuation(string: str) -> str:
69
+ return (
70
+ string.strip(".")
71
+ .strip(",")
72
+ .strip(":")
73
+ .strip(";")
74
+ .strip("?")
75
+ .strip("!")
76
+ .strip("(")
77
+ .strip(")")
78
+ .strip("-")
79
+ .strip("_")
80
+ .strip("+")
81
+ .strip("/")
82
+ .strip("\\")
83
+ )
@@ -0,0 +1,116 @@
1
+ import re
2
+ from feistel import FPECipher
3
+
4
+
5
+ from redacted import Dictionary, DEFAULT_TAG
6
+
7
+
8
class Redactor:
    """
    Parent class for redactors

    A word is ciphered when the dictionary contains it, or when it starts with
    the tag (in which case the tag itself is kept in clear in the output).
    """

    def __init__(
        self, dictionary: "Dictionary", tag: str, cipher: "FPECipher", both: bool
    ):
        self.dictionary = dictionary
        self.tag = tag
        self.cipher = cipher
        self.both = both

    def redact(self, line: str, *delimiters) -> str:
        """
        Returns the ciphered version of the passed input data

        The line is split on whitespace and on each of the optional extra
        `delimiters`; the resulting words are joined back with single spaces.
        """
        # Whitespace always separates words, so whitespace delimiters are
        # already covered by `\s`. The other ones are escaped and added as
        # alternatives, so that each delimiter splits on its own and regex
        # metacharacters such as "." or "(" are taken literally.
        extra = [
            re.escape(delim)
            for delim in delimiters
            if delim and delim not in (" ", "\t", "\n", "\\s")
        ]
        splitter = re.compile("|".join([r"\s", *extra]))
        use_dictionary = self.both or not self.dictionary.is_empty()

        tokens = []
        for word in splitter.split(line):
            if use_dictionary and self.dictionary.contains(word):
                tokens.append(self.cipher.encrypt(word))
            elif self.tag and word.startswith(self.tag):
                # Keep the tag readable and cipher only what follows it
                tokens.append(self.tag + self.cipher.encrypt(word[len(self.tag) :]))
            else:
                tokens.append(word)

        return " ".join(tokens)

    def expand(self, line: str) -> str:
        """
        Returns the deciphered version of the passed input data

        Tagged words are deciphered and keep their tag. When a dictionary is in
        use, any other word is tentatively deciphered and only replaced if the
        result belongs to the dictionary.
        """
        use_dictionary = self.both or not self.dictionary.is_empty()

        words = []
        for word in line.split(" "):
            if self.tag and word.startswith(self.tag):
                words.append(self.tag + self.cipher.decrypt(word[len(self.tag) :]))
            elif not use_dictionary:
                words.append(word)
            elif word.strip():
                deciphered = self.cipher.decrypt(word)
                if deciphered and self.dictionary.contains(deciphered):
                    words.append(deciphered)
                else:
                    words.append(word)
            # Otherwise: blank fragments (e.g. new lines) are dropped when a
            # dictionary is in use

        return " ".join(words)

    def clean(self, string: str) -> str:
        """
        Removes every occurrence of the tag from the passed string
        """
        if self.tag:
            return string.replace(self.tag, "")
        return string
102
+
103
+
104
class DefaultRedactor(Redactor):
    """
    Redactor relying on the default tag only (no dictionary)
    """

    def __init__(self, cipher: FPECipher):
        super().__init__(
            dictionary=Dictionary([]), tag=DEFAULT_TAG, cipher=cipher, both=False
        )
107
+
108
+
109
class RedactorWithDictionary(Redactor):
    """
    Redactor relying on the passed dictionary only (the tag is left empty)
    """

    def __init__(self, dictionary: Dictionary, cipher: FPECipher):
        super().__init__(dictionary=dictionary, tag="", cipher=cipher, both=False)
112
+
113
+
114
class RedactorWithTag(Redactor):
    """
    Redactor relying on the passed tag only (with an empty dictionary)
    """

    def __init__(self, tag: str, cipher: FPECipher):
        super().__init__(dictionary=Dictionary([]), tag=tag, cipher=cipher, both=False)
@@ -0,0 +1 @@
1
# Default tag: the tilde character, placed before a word to mark it for redaction.
DEFAULT_TAG = "~"
@@ -0,0 +1,123 @@
1
+ Metadata-Version: 2.1
2
+ Name: redacted-py
3
+ Version: 1.0.4
4
+ Summary: Redacting classified documents
5
+ Author-email: Cyril Dever <cdever@pep-s.com>
6
+ License: Copyright (c) 2024 Cyril Dever
7
+
8
+ Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
9
+
10
+ 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
11
+ 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
12
+
13
+ Subject to the terms and conditions of this license, each copyright holder and contributor hereby grants to those receiving rights under this license a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable (except for failure to satisfy the conditions of this license) patent license to make, have made, use, offer to sell, sell, import, and otherwise transfer this software, where such license applies only to those patent claims, already acquired or hereafter acquired, licensable by such copyright holder or contributor that are necessarily infringed by:
14
+
15
+ (a) their Contribution(s) (the licensed copyrights of copyright holders and non-copyrightable additions of contributors, in source or binary form) alone; or
16
+ (b) combination of their Contribution(s) with the work of authorship to which such Contribution(s) was added by such copyright holder or contributor, if, at the time the Contribution is added, such addition causes such combination to be necessarily infringed. The patent license shall not apply to any other combinations which include the Contribution.
17
+
18
+ Except as expressly stated above, no rights or licenses from any copyright holder or contributor is granted under this license, whether expressly, by implication, estoppel or otherwise.
19
+
20
+ DISCLAIMER
21
+
22
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
23
+
24
+ Project-URL: Homepage, https://github.com/cyrildever/redacted
25
+ Keywords: data,obfuscation,data masking,redacted,classified
26
+ Classifier: Programming Language :: Python :: 3
27
+ Classifier: License :: OSI Approved :: BSD License
28
+ Classifier: Operating System :: OS Independent
29
+ Requires-Python: >=3.10.2
30
+ Description-Content-Type: text/markdown
31
+ License-File: LICENSE
32
+ Requires-Dist: feistel-py>=0.2.0
33
+
34
+ # redacted-py
35
+ _Redacting classified documents_
36
+
37
+ ![GitHub tag (latest by date)](https://img.shields.io/github/v/tag/cyrildever/redacted)
38
+ ![GitHub last commit](https://img.shields.io/github/last-commit/cyrildever/redacted)
39
+ ![GitHub issues](https://img.shields.io/github/issues/cyrildever/redacted)
40
+ ![GitHub license](https://img.shields.io/github/license/cyrildever/redacted)
41
+ ![PyPI - Version](https://img.shields.io/pypi/v/redacted-py)
42
+
43
+ This repository holds the code base for my `redacted-py` library in Python. \
44
+ It is mainly based off my [Feistel cipher for Format-Preserving Encryption](https://github.com/cyrildever/feistel-py) to which I added a few tools to handle document, database and file manipulation to ease out the operation.
45
+
46
+ ### Motivation
47
+
48
+ In some fields (healthcare, for instance), protecting the privacy of data while still being able to conduct in-depth studies is both vital and mandatory. Redacting documents and databases is therefore a necessary step.
49
+ With `redacted-py`, I provide a simple yet secure tool to help redact documents, using either a dictionary, a record layout or a tag to decide which parts should actually be redacted.
50
+
51
+
52
+ ### Usage
53
+
54
+ You can use either a dictionary or a tag (or both) to identify the words you want to redact in a document.
55
+ The tag should be placed before any word that should be redacted. The default tag is the tilde character (`~`).
56
+
57
+ For example, the following sentence will only see the word `tagged` redacted: `"This is a ~tagged sentence"`.
58
+
59
+ ```console
60
+ $ pip install redacted-py
61
+ ```
62
+
63
+ ```python
64
+ from redacted import DefaultRedactor, Dictionary
65
+ from feistel import FPECipher, SHA_256
66
+
67
+ source = "Some text ~tagged or using words in a dictionary"
68
+
69
+ cipher = FPECipher(SHA_256, key, 10)  # `key` is your own secret key, e.g. a 64-character hex string
70
+ redactor = DefaultRedactor(cipher)
71
+ redacted = redactor.redact(source)
72
+
73
+ expanded = redactor.expand(redacted)
74
+ assert expanded == source, "Original data should equal ciphered then deciphered data"
75
+
76
+ cleansed = redactor.clean(expanded)
77
+ assert cleansed == "Some text tagged or using words in a dictionary", "Cleaning should remove any tag mark"
78
+ ```
79
+
80
+ You may also use it in the console with the following command line instructions:
81
+ ```
82
+ usage: python3 -m redacted [-h] [-b | --both | --no-both] [-d DICTIONARY] [-H HASH] [-i INPUT] [-k KEY] [-o OUTPUT] [-r ROUNDS] [-t TAG] [-x | --expand | --no-expand]
83
+
84
+ options:
85
+ -h, --help show this help message and exit
86
+ -b, --both, --no-both
87
+ Add to use both dictionary and tag
88
+ -d DICTIONARY, --dictionary DICTIONARY
89
+ The optional path to the dictionary of words to redact
90
+ -H HASH, --hash HASH The hash engine for the round function [default sha-256]
91
+ -i INPUT, --input INPUT
92
+ The path to the document to be redacted
93
+ -k KEY, --key KEY The optional key for the FPE scheme (leave it empty to use default)
94
+ -o OUTPUT, --output OUTPUT
95
+ The name of the output file
96
+ -r ROUNDS, --rounds ROUNDS
97
+ The number of rounds for the Feistel cipher [default 10]
98
+ -t TAG, --tag TAG The optional tag that prefixes words to redact [default ~]
99
+ -x, --expand, --no-expand
100
+ Add to expand a redacted document
101
+ ```
102
+
103
+
104
+ ### Tests
105
+
106
+ ```console
107
+ $ git clone https://github.com/cyrildever/redacted.git
108
+ $ cd redacted/py/
109
+ $ pip install -e .
110
+ $ python3 -m unittest discover
111
+ ```
112
+
113
+
114
+ ### License
115
+
116
+ Use of the `redacted` libraries and executables is subject to fees for commercial purposes and to compliance with the [BSD-2-Clause-Patent license](LICENSE). \
117
+ Please [contact me](mailto:cdever@pep-s.com) to get further information.
118
+
119
+ _NB: It is still under development so use in production at your own risk for now._
120
+
121
+
122
+ <hr />
123
+ &copy; 2024 Cyril Dever. All rights reserved.
@@ -0,0 +1,17 @@
1
+ LICENSE
2
+ MANIFEST.in
3
+ README.md
4
+ pyproject.toml
5
+ src/redacted/__init__.py
6
+ src/redacted/__main__.py
7
+ src/redacted/dictionary.py
8
+ src/redacted/redactor.py
9
+ src/redacted/tag.py
10
+ src/redacted_py.egg-info/PKG-INFO
11
+ src/redacted_py.egg-info/SOURCES.txt
12
+ src/redacted_py.egg-info/dependency_links.txt
13
+ src/redacted_py.egg-info/entry_points.txt
14
+ src/redacted_py.egg-info/requires.txt
15
+ src/redacted_py.egg-info/top_level.txt
16
+ tests/test_dictionary.py
17
+ tests/test_redactor.py
@@ -0,0 +1,2 @@
1
+ [console_scripts]
2
+ redacted-py = redacted.__main__:main
@@ -0,0 +1 @@
1
+ feistel-py>=0.2.0
@@ -0,0 +1,23 @@
1
+ from unittest import TestCase
2
+
3
+ from redacted import Dictionary, file2Dictionary, string2Dictionary
4
+
5
+
6
class TestDictionary(TestCase):
    """Tests for the Dictionary class and its string/file factories."""

    def test_string2Dictionary(self):
        # Imported locally to keep this test self-contained
        from pathlib import Path

        string = "Cyril Antoine Laurent,Dever"

        # Default delimiter is the space: "Laurent,Dever" stays a single word
        dic = string2Dictionary(string)
        self.assertEqual(dic.length(), 3)
        self.assertFalse(dic.contains("Dever"))

        # Splitting on both the space and the comma isolates every name
        dic = string2Dictionary(string, " ", ",")
        self.assertEqual(dic.length(), 4)
        self.assertTrue(dic.contains("Dever"))
        self.assertFalse(dic.is_empty())
        self.assertEqual(dic.to_string(), "Cyril Antoine Laurent Dever")

        dic2 = Dictionary(["Cyril", "Antoine", "Laurent", "Dever"])
        self.assertEqual(dic, dic2)

        # Locate the fixture next to this test module, so that the test does
        # not depend on the directory it is launched from.
        fixture = Path(__file__).resolve().parent / "dictionaryExample.txt"
        dic = file2Dictionary(str(fixture))
        self.assertEqual(dic.length(), 5)
        self.assertEqual(dic.to_string(), "M. Cyril Antoine Laurent Dever")
@@ -0,0 +1,71 @@
1
+ from unittest import TestCase
2
+ from feistel import FPECipher, BLAKE2B, KECCAK, SHA_256
3
+
4
+
5
+ from redacted import Dictionary, DefaultRedactor, RedactorWithDictionary
6
+
7
+
8
class TestRedactor(TestCase):
    """Reference-vector and round-trip tests for the redactors."""

    # Cipher parameters shared by every test
    KEY = "1234567890abcdef1234567890abcdef1234567890abcdef1234567890abcdef"
    ROUNDS = 10

    def test_dictionary_redactor(self):
        dic = Dictionary(["M.", "Cyril", "Antoine", "Laurent", "Dever"])
        ref = "B6ds. is testing ¼= Du:,l26 library while ¾.=y£|v Izizb is listening to Âvhis*l<"

        txt = "Cyril is testing M. Dever's library while Antoine Dever is listening to Laurent."
        redactor = RedactorWithDictionary(
            dic, FPECipher(SHA_256, self.KEY, self.ROUNDS)
        )
        redacted = redactor.redact(txt)
        self.assertEqual(redacted, ref)
        expanded = redactor.expand(redacted)
        self.assertEqual(expanded, txt)

        # Another hash engine must give another (reference) ciphertext
        blake2 = "¸lk€$ is testing F: B!@x7;1 library while Cs>v0'* ¹'90< is listening to Pz2;ws?o"
        redacted = RedactorWithDictionary(
            dic, FPECipher(BLAKE2B, self.KEY, self.ROUNDS)
        ).redact(txt)
        self.assertEqual(redacted, blake2)
        self.assertNotEqual(redacted, ref)

        keccak = "H1i,{ is testing ½5 ¿&bv8f8 library while ¸&7+r$u ¹|6'h is listening to Å€j;$\"4<"
        redacted = RedactorWithDictionary(
            dic, FPECipher(KECCAK, self.KEY, self.ROUNDS)
        ).redact(txt)
        self.assertEqual(redacted, keccak)
        self.assertNotEqual(redacted, ref)

    def test_tag_redactor(self):
        ref = "~B6ds. is testing ~¼= ~Du:,l26 library while ~¾.=y£|v ~Izizb is listening to ~Âvhis*l<"

        txt = "~Cyril is testing ~M. ~Dever's library while ~Antoine ~Dever is listening to ~Laurent."
        redactor = DefaultRedactor(FPECipher(SHA_256, self.KEY, self.ROUNDS))
        redacted = redactor.redact(txt)
        self.assertEqual(redacted, ref)
        expanded = redactor.expand(redacted)
        self.assertEqual(expanded, txt)

        # Cleaning removes the tags and leaves the plain sentence
        cleansed = redactor.clean(expanded)
        self.assertEqual(
            cleansed,
            "Cyril is testing M. Dever's library while Antoine Dever is listening to Laurent.",
        )