exeplot 0.4.3__tar.gz → 0.5.4__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {exeplot-0.4.3 → exeplot-0.5.4}/.coveragerc +1 -0
- {exeplot-0.4.3 → exeplot-0.5.4}/.github/workflows/python-package.yml +1 -1
- {exeplot-0.4.3 → exeplot-0.5.4}/PKG-INFO +3 -3
- {exeplot-0.4.3 → exeplot-0.5.4}/README.md +1 -1
- {exeplot-0.4.3 → exeplot-0.5.4}/docs/coverage.svg +1 -1
- exeplot-0.5.4/docs/pages/img/upx_calc_byte.png +0 -0
- exeplot-0.5.4/docs/pages/img/upx_calc_entropy.png +0 -0
- {exeplot-0.4.3 → exeplot-0.5.4}/pyproject.toml +1 -1
- exeplot-0.5.4/src/exeplot/VERSION.txt +1 -0
- {exeplot-0.4.3 → exeplot-0.5.4}/src/exeplot/__conf__.py +3 -3
- exeplot-0.5.4/src/exeplot/__init__.py +9 -0
- {exeplot-0.4.3 → exeplot-0.5.4}/src/exeplot/__main__.py +5 -4
- exeplot-0.5.4/src/exeplot/utils.py +91 -0
- {exeplot-0.4.3 → exeplot-0.5.4}/src/exeplot.egg-info/PKG-INFO +3 -3
- {exeplot-0.4.3 → exeplot-0.5.4}/src/exeplot.egg-info/SOURCES.txt +2 -0
- {exeplot-0.4.3 → exeplot-0.5.4}/tests/test_others.py +7 -5
- exeplot-0.4.3/src/exeplot/VERSION.txt +0 -1
- exeplot-0.4.3/src/exeplot/__init__.py +0 -5
- exeplot-0.4.3/src/exeplot/utils.py +0 -67
- {exeplot-0.4.3 → exeplot-0.5.4}/.gitignore +0 -0
- {exeplot-0.4.3 → exeplot-0.5.4}/.readthedocs.yml +0 -0
- {exeplot-0.4.3 → exeplot-0.5.4}/LICENSE +0 -0
- {exeplot-0.4.3 → exeplot-0.5.4}/_config.yml +0 -0
- {exeplot-0.4.3 → exeplot-0.5.4}/docs/mkdocs.yml +0 -0
- {exeplot-0.4.3 → exeplot-0.5.4}/docs/pages/css/extra.css +0 -0
- {exeplot-0.4.3 → exeplot-0.5.4}/docs/pages/img/calc_orig_entropy.png +0 -0
- {exeplot-0.4.3 → exeplot-0.5.4}/docs/pages/img/calc_packed_byte.png +0 -0
- /exeplot-0.4.3/docs/pages/img/upx_calc_byte.png → /exeplot-0.5.4/docs/pages/img/calc_packed_byte2.png +0 -0
- /exeplot-0.4.3/docs/pages/img/upx_calc_entropy.png → /exeplot-0.5.4/docs/pages/img/calc_packed_entropy.png +0 -0
- {exeplot-0.4.3 → exeplot-0.5.4}/docs/pages/img/calc_packed_nested_pie.png +0 -0
- {exeplot-0.4.3 → exeplot-0.5.4}/docs/pages/img/calc_packed_pie.png +0 -0
- {exeplot-0.4.3 → exeplot-0.5.4}/docs/pages/img/icon.png +0 -0
- {exeplot-0.4.3 → exeplot-0.5.4}/docs/pages/img/logo.png +0 -0
- {exeplot-0.4.3 → exeplot-0.5.4}/docs/pages/index.md +0 -0
- {exeplot-0.4.3 → exeplot-0.5.4}/docs/requirements.txt +0 -0
- {exeplot-0.4.3 → exeplot-0.5.4}/pytest.ini +0 -0
- {exeplot-0.4.3 → exeplot-0.5.4}/requirements.txt +0 -0
- {exeplot-0.4.3 → exeplot-0.5.4}/setup.cfg +0 -0
- {exeplot-0.4.3 → exeplot-0.5.4}/src/exeplot/__info__.py +0 -0
- {exeplot-0.4.3 → exeplot-0.5.4}/src/exeplot/plots/__common__.py +0 -0
- {exeplot-0.4.3 → exeplot-0.5.4}/src/exeplot/plots/__init__.py +0 -0
- {exeplot-0.4.3 → exeplot-0.5.4}/src/exeplot/plots/byte.py +0 -0
- {exeplot-0.4.3 → exeplot-0.5.4}/src/exeplot/plots/diff.py +0 -0
- {exeplot-0.4.3 → exeplot-0.5.4}/src/exeplot/plots/entropy.py +0 -0
- {exeplot-0.4.3 → exeplot-0.5.4}/src/exeplot/plots/graph.py +0 -0
- {exeplot-0.4.3 → exeplot-0.5.4}/src/exeplot/plots/nested_pie.py +0 -0
- {exeplot-0.4.3 → exeplot-0.5.4}/src/exeplot/plots/pie.py +0 -0
- {exeplot-0.4.3 → exeplot-0.5.4}/src/exeplot.egg-info/dependency_links.txt +0 -0
- {exeplot-0.4.3 → exeplot-0.5.4}/src/exeplot.egg-info/entry_points.txt +0 -0
- {exeplot-0.4.3 → exeplot-0.5.4}/src/exeplot.egg-info/requires.txt +0 -0
- {exeplot-0.4.3 → exeplot-0.5.4}/src/exeplot.egg-info/top_level.txt +0 -0
- {exeplot-0.4.3 → exeplot-0.5.4}/tests/__init__.py +0 -0
- {exeplot-0.4.3 → exeplot-0.5.4}/tests/hello.elf +0 -0
- {exeplot-0.4.3 → exeplot-0.5.4}/tests/hello.exe +0 -0
- {exeplot-0.4.3 → exeplot-0.5.4}/tests/hello.macho +0 -0
- {exeplot-0.4.3 → exeplot-0.5.4}/tests/test_plots.py +0 -0
|
@@ -19,7 +19,7 @@ jobs:
|
|
|
19
19
|
fail-fast: false
|
|
20
20
|
matrix:
|
|
21
21
|
os: [ubuntu-latest]
|
|
22
|
-
python-version: ["3.
|
|
22
|
+
python-version: ["3.10", "3.11", "3.12", "3.13"]
|
|
23
23
|
steps:
|
|
24
24
|
- uses: actions/checkout@v3
|
|
25
25
|
- name: Set up Python ${{ matrix.python-version }}
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: exeplot
|
|
3
|
-
Version: 0.4.3
|
|
3
|
+
Version: 0.5.4
|
|
4
4
|
Summary: Library for plotting executable samples supporting multiple formats
|
|
5
5
|
Author-email: Alexandre D'Hondt <alexandre.dhondt@gmail.com>
|
|
6
6
|
License: GNU GENERAL PUBLIC LICENSE
|
|
@@ -689,7 +689,7 @@ Classifier: Intended Audience :: Developers
|
|
|
689
689
|
Classifier: License :: OSI Approved :: GNU General Public License v3 or later (GPLv3+)
|
|
690
690
|
Classifier: Programming Language :: Python :: 3
|
|
691
691
|
Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
|
692
|
-
Requires-Python: <4,>=3.
|
|
692
|
+
Requires-Python: <4,>=3.10
|
|
693
693
|
Description-Content-Type: text/markdown
|
|
694
694
|
License-File: LICENSE
|
|
695
695
|
Requires-Dist: lief>=0.16.1
|
|
@@ -736,7 +736,7 @@ Draw a simplified byte plot of `calc_packed.exe`:
|
|
|
736
736
|
$ exeplot byte calc_packed.exe --no-title --no-legend
|
|
737
737
|
```
|
|
738
738
|
|
|
739
|
-

|
|
740
740
|
|
|
741
741
|
Draw a pie plot of `calc_packed.exe`:
|
|
742
742
|
|
|
@@ -32,7 +32,7 @@ Draw a simplified byte plot of `calc_packed.exe`:
|
|
|
32
32
|
$ exeplot byte calc_packed.exe --no-title --no-legend
|
|
33
33
|
```
|
|
34
34
|
|
|
35
|
-

|
|
36
36
|
|
|
37
37
|
Draw a pie plot of `calc_packed.exe`:
|
|
38
38
|
|
|
@@ -1 +1 @@
|
|
|
1
|
-
<svg xmlns="http://www.w3.org/2000/svg" width="114" height="20" role="img" aria-label="coverage: 96.
|
|
1
|
+
<svg xmlns="http://www.w3.org/2000/svg" width="114" height="20" role="img" aria-label="coverage: 96.62%"><title>coverage: 96.62%</title><linearGradient id="s" x2="0" y2="100%"><stop offset="0" stop-color="#bbb" stop-opacity=".1"/><stop offset="1" stop-opacity=".1"/></linearGradient><clipPath id="r"><rect width="114" height="20" rx="3" fill="#fff"/></clipPath><g clip-path="url(#r)"><rect width="61" height="20" fill="#555"/><rect x="61" width="53" height="20" fill="#4c1"/><rect width="114" height="20" fill="url(#s)"/></g><g fill="#fff" text-anchor="middle" font-family="Verdana,Geneva,DejaVu Sans,sans-serif" text-rendering="geometricPrecision" font-size="110"><text aria-hidden="true" x="315" y="150" fill="#010101" fill-opacity=".3" transform="scale(.1)" textLength="510">coverage</text><text x="315" y="140" transform="scale(.1)" fill="#fff" textLength="510">coverage</text><text aria-hidden="true" x="865" y="150" fill="#010101" fill-opacity=".3" transform="scale(.1)" textLength="430">96.62%</text><text x="865" y="140" transform="scale(.1)" fill="#fff" textLength="430">96.62%</text></g></svg>
|
|
Binary file
|
|
Binary file
|
|
@@ -22,7 +22,7 @@ authors = [
|
|
|
22
22
|
description = "Library for plotting executable samples supporting multiple formats"
|
|
23
23
|
license = {file = "LICENSE"}
|
|
24
24
|
keywords = ["python", "development", "programming", "executable-samples", "plot", "entropy", "cfg"]
|
|
25
|
-
requires-python = ">=3.
|
|
25
|
+
requires-python = ">=3.10,<4"
|
|
26
26
|
classifiers = [
|
|
27
27
|
"Development Status :: 5 - Production/Stable",
|
|
28
28
|
"Environment :: Console",
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
0.5.4
|
|
@@ -29,7 +29,7 @@ config = {
|
|
|
29
29
|
numpy.int = numpy.int_ # dirty fix to "AttributeError: module 'numpy' has no attribute 'int'."
|
|
30
30
|
|
|
31
31
|
|
|
32
|
-
def check_imports(*names):
|
|
32
|
+
def check_imports(*names) -> None:
|
|
33
33
|
import warnings
|
|
34
34
|
from inspect import currentframe
|
|
35
35
|
glob = currentframe().f_back.f_globals
|
|
@@ -42,7 +42,7 @@ def check_imports(*names):
|
|
|
42
42
|
glob['_IMP'] = False
|
|
43
43
|
|
|
44
44
|
|
|
45
|
-
def configure(): # pragma: no cover
|
|
45
|
+
def configure() -> None: # pragma: no cover
|
|
46
46
|
from configparser import ConfigParser
|
|
47
47
|
from os.path import exists, expanduser
|
|
48
48
|
path = expanduser("~/.exeplot.conf")
|
|
@@ -58,7 +58,7 @@ def configure(): # pragma: no cover
|
|
|
58
58
|
plt.rcParams['font.family'] = config['font_family']
|
|
59
59
|
|
|
60
60
|
|
|
61
|
-
def configure_fonts(**kw):
|
|
61
|
+
def configure_fonts(**kw) -> dict:
|
|
62
62
|
import matplotlib
|
|
63
63
|
matplotlib.rc('font', **{k.split("_")[1]: kw.pop(k, config[k]) for k in ['font_family', 'font_size']})
|
|
64
64
|
kw['title-font'] = {'fontfamily': kw.pop('title_font_family', config['font_family']),
|
|
@@ -1,11 +1,12 @@
|
|
|
1
1
|
# -*- coding: UTF-8 -*-
|
|
2
|
+
from argparse import ArgumentParser, Namespace, RawTextHelpFormatter
|
|
3
|
+
|
|
2
4
|
from .__info__ import __author__, __copyright__, __email__, __license__, __source__, __version__
|
|
3
5
|
from .__init__ import *
|
|
4
6
|
from .__init__ import __all__ as _plots
|
|
5
7
|
|
|
6
8
|
|
|
7
|
-
def _parser(name, description, examples):
|
|
8
|
-
from argparse import ArgumentParser, RawTextHelpFormatter
|
|
9
|
+
def _parser(name: str, description: str, examples: list[str]) -> ArgumentParser:
|
|
9
10
|
descr = f"{name} {__version__}\n\nAuthor : {__author__} ({__email__})\nCopyright: {__copyright__}\nLicense :" \
|
|
10
11
|
f" {__license__}\nSource : {__source__}\n\n{description}.\n\n"
|
|
11
12
|
examples = [f"exeplot {e}" if not e.startswith("exeplot ") else e for e in examples]
|
|
@@ -13,7 +14,7 @@ def _parser(name, description, examples):
|
|
|
13
14
|
epilog="usage examples:\n " + "\n ".join(examples) if len(examples) > 0 else None)
|
|
14
15
|
|
|
15
16
|
|
|
16
|
-
def _setup(parser): # pragma: no cover
|
|
17
|
+
def _setup(parser: ArgumentParser) -> Namespace: # pragma: no cover
|
|
17
18
|
args = parser.parse_args()
|
|
18
19
|
if hasattr(args, "verbose"):
|
|
19
20
|
import logging
|
|
@@ -22,7 +23,7 @@ def _setup(parser): # pragma: no cover
|
|
|
22
23
|
return args
|
|
23
24
|
|
|
24
25
|
|
|
25
|
-
def main():
|
|
26
|
+
def main() -> None: # pragma: no cover
|
|
26
27
|
from os import makedirs
|
|
27
28
|
parser = _parser("Exeplot", "This tool allows to plot executable sample(s) in different ways",
|
|
28
29
|
["byte binary.exe", "entropy binary1.exe binary2.exe --scale"])
|
|
@@ -0,0 +1,91 @@
|
|
|
1
|
+
# -*- coding: UTF-8 -*-
|
|
2
|
+
import numpy as np
|
|
3
|
+
from math import log2
|
|
4
|
+
from typing import Optional
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
__all__ = ["ensure_str", "human_readable_size", "ngrams_counts", "ngrams_distribution", "shannon_entropy"]
|
|
8
|
+
|
|
9
|
+
shannon_entropy = lambda b: -sum([p*log2(p) for p in [float(ctr)/len(b) for ctr in [b.count(c) for c in set(b)]]]) or 0.
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
def ensure_str(s: str | bytes, encoding: str = "utf-8", errors: str = "strict") -> str:
|
|
13
|
+
""" Ensure that an input string is decoded. """
|
|
14
|
+
if isinstance(s, bytes):
|
|
15
|
+
try:
|
|
16
|
+
return s.decode(encoding, errors)
|
|
17
|
+
except:
|
|
18
|
+
return s.decode("latin-1")
|
|
19
|
+
elif not isinstance(s, (str, bytes)):
|
|
20
|
+
raise TypeError("not expecting type '%s'" % type(s))
|
|
21
|
+
return s
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
def human_readable_size(size: int, precision: int = 0) -> str:
|
|
25
|
+
""" Display bytes' size in a human-readable format given a precision. """
|
|
26
|
+
i, units = 0, ["B", "KB", "MB", "GB", "TB", "PB", "EB", "ZB", "YB"]
|
|
27
|
+
while size >= 1024 and i < len(units)-1:
|
|
28
|
+
i += 1
|
|
29
|
+
size /= 1024.0
|
|
30
|
+
return "%.*f%s" % (precision, size, units[i])
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
def ngrams_counts(byte_obj: bytes | object, n: int = 1, step: int = 1) -> list[tuple[bytes, int]]:
|
|
34
|
+
""" Output a sorted list of tuples (n-gram, counts) for an input byte sequence or byte object.
|
|
35
|
+
If the input is a byte object, the result is cached.
|
|
36
|
+
|
|
37
|
+
:param byte_obj: byte sequence ('bytes') or byte object with "bytes" and "size" attributes (i.e. pathlib2.Path)
|
|
38
|
+
:param n: n determining the size of n-grams, defaults to 1
|
|
39
|
+
:param step: step for sliding the n-grams
|
|
40
|
+
"""
|
|
41
|
+
if n not in (1, 2, 3):
|
|
42
|
+
raise ValueError("n must be 1, 2, or 3")
|
|
43
|
+
if step <= 0:
|
|
44
|
+
raise ValueError("step must be positive")
|
|
45
|
+
try:
|
|
46
|
+
return byte_obj._ngram_counts_cache[n]
|
|
47
|
+
except (AttributeError, KeyError):
|
|
48
|
+
pass
|
|
49
|
+
if isinstance(byte_obj, bytes) or hasattr(byte_obj, "bytes"):
|
|
50
|
+
a = np.frombuffer(data := byte_obj if isinstance(byte_obj, bytes) else byte_obj.bytes, dtype=np.uint8)
|
|
51
|
+
l = a.size
|
|
52
|
+
if l < n:
|
|
53
|
+
return {}
|
|
54
|
+
if n == 1:
|
|
55
|
+
counts = {b.to_bytes(1, "big"): int(c) for b, c in \
|
|
56
|
+
enumerate(np.bincount(np.frombuffer(data, dtype=np.uint8)))}
|
|
57
|
+
else:
|
|
58
|
+
end = (m := (l - n) // step + 1) * step
|
|
59
|
+
grams = np.stack((a[0:end:step], a[1:1+end:step]), axis=1) if n == 2 else \
|
|
60
|
+
np.stack((a[0:end:step], a[1:1+end:step], a[2:2+end:step]), axis=1)
|
|
61
|
+
counts = {bytes(row): int(c) for row, c in zip(*np.unique(grams, axis=0, return_counts=True))}
|
|
62
|
+
counts = sorted(counts.items(), key=lambda p: p[1], reverse=True)
|
|
63
|
+
if isinstance(byte_obj, bytes):
|
|
64
|
+
return counts
|
|
65
|
+
elif hasattr(byte_obj, "bytes"):
|
|
66
|
+
if not hasattr(byte_obj, "_ngram_counts_cache"):
|
|
67
|
+
byte_obj._ngram_counts_cache = {}
|
|
68
|
+
if n not in byte_obj._ngram_counts_cache.keys():
|
|
69
|
+
byte_obj._ngram_counts_cache[n] = counts
|
|
70
|
+
return byte_obj._ngram_counts_cache[n]
|
|
71
|
+
raise TypeError("Bad input type ; should be a byte sequence or object")
|
|
72
|
+
|
|
73
|
+
|
|
74
|
+
def ngrams_distribution(byte_obj: bytes | object, n: int = 1, step: int = 1, n_most_common: Optional[int] = None,
|
|
75
|
+
n_exclude_top: int = 0, exclude: Optional[list] = None) -> list[tuple[bytes, int]]:
|
|
76
|
+
""" Compute the n-grams distribution of an input byte sequence or byte object given exclusions.
|
|
77
|
+
|
|
78
|
+
:param byte_obj: byte sequence ('bytes') or byte object with "bytes" and "size" attributes (i.e. pathlib2.Path)
|
|
79
|
+
:param n: n determining the size of n-grams, defaults to 1
|
|
80
|
+
:param step: step for sliding the n-grams
|
|
81
|
+
:param n_most_common: number of n-grams to be kept in the result, keep all by default
|
|
82
|
+
:param n_exclude_top: number of n-grams to be excluded from the top of the histogram, no exclusion by default
|
|
83
|
+
:param exclude: list of specific n-grams to be excluded, no exclusion by default
|
|
84
|
+
:return: list of n_most_common (n-gram, count) pairs
|
|
85
|
+
"""
|
|
86
|
+
c = ngrams_counts(byte_obj, n, step)
|
|
87
|
+
r = c[:len(c) if n_most_common is None else n_most_common + n_exclude_top + len(exclude or [])]
|
|
88
|
+
if exclude is not None:
|
|
89
|
+
r = [(ngram, count) for ngram, count in r if ngram not in exclude]
|
|
90
|
+
return r[n_exclude_top:n_exclude_top+(n_most_common or len(c))]
|
|
91
|
+
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: exeplot
|
|
3
|
-
Version: 0.4.3
|
|
3
|
+
Version: 0.5.4
|
|
4
4
|
Summary: Library for plotting executable samples supporting multiple formats
|
|
5
5
|
Author-email: Alexandre D'Hondt <alexandre.dhondt@gmail.com>
|
|
6
6
|
License: GNU GENERAL PUBLIC LICENSE
|
|
@@ -689,7 +689,7 @@ Classifier: Intended Audience :: Developers
|
|
|
689
689
|
Classifier: License :: OSI Approved :: GNU General Public License v3 or later (GPLv3+)
|
|
690
690
|
Classifier: Programming Language :: Python :: 3
|
|
691
691
|
Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
|
692
|
-
Requires-Python: <4,>=3.
|
|
692
|
+
Requires-Python: <4,>=3.10
|
|
693
693
|
Description-Content-Type: text/markdown
|
|
694
694
|
License-File: LICENSE
|
|
695
695
|
Requires-Dist: lief>=0.16.1
|
|
@@ -736,7 +736,7 @@ Draw a simplified byte plot of `calc_packed.exe`:
|
|
|
736
736
|
$ exeplot byte calc_packed.exe --no-title --no-legend
|
|
737
737
|
```
|
|
738
738
|
|
|
739
|
-

|
|
740
740
|
|
|
741
741
|
Draw a pie plot of `calc_packed.exe`:
|
|
742
742
|
|
|
@@ -15,6 +15,8 @@ docs/pages/index.md
|
|
|
15
15
|
docs/pages/css/extra.css
|
|
16
16
|
docs/pages/img/calc_orig_entropy.png
|
|
17
17
|
docs/pages/img/calc_packed_byte.png
|
|
18
|
+
docs/pages/img/calc_packed_byte2.png
|
|
19
|
+
docs/pages/img/calc_packed_entropy.png
|
|
18
20
|
docs/pages/img/calc_packed_nested_pie.png
|
|
19
21
|
docs/pages/img/calc_packed_pie.png
|
|
20
22
|
docs/pages/img/icon.png
|
|
@@ -2,7 +2,6 @@
|
|
|
2
2
|
# -*- coding: UTF-8 -*-
|
|
3
3
|
import matplotlib.pyplot as plt
|
|
4
4
|
import os
|
|
5
|
-
from collections import Counter
|
|
6
5
|
from exeplot.plots.__common__ import Binary
|
|
7
6
|
from exeplot.utils import *
|
|
8
7
|
from unittest import TestCase
|
|
@@ -21,12 +20,15 @@ class TestOthers(TestCase):
|
|
|
21
20
|
class TestUtils(TestCase):
|
|
22
21
|
def test_ngrams_functions(self):
|
|
23
22
|
self.assertRaises(TypeError, ngrams_counts, 123)
|
|
24
|
-
|
|
23
|
+
for n in [0, 4]:
|
|
24
|
+
self.assertRaises(ValueError, ngrams_counts, b"abc", n=n)
|
|
25
|
+
self.assertRaises(ValueError, ngrams_counts, b"abc", step=-1)
|
|
26
|
+
self.assertEqual(ngrams_counts(b"a", n=2), {})
|
|
27
|
+
self.assertTrue(isinstance(ngrams_counts(seq := b"\x00" * 4 + os.urandom(120) + b"\xff" * 4), list))
|
|
28
|
+
self.assertTrue(isinstance(ngrams_counts(seq := b"\x00" * 4 + os.urandom(120) + b"\xff" * 4, n=2), list))
|
|
25
29
|
class Test:
|
|
26
30
|
bytes = seq
|
|
27
|
-
|
|
28
|
-
histogram = ngrams_distribution(t := Test(), exclude=(b"\x00", b"\xff"))
|
|
29
|
-
self.assertTrue(isinstance(histogram, list))
|
|
31
|
+
self.assertTrue(isinstance(histogram := ngrams_distribution(t := Test(), exclude=(b"\x00", b"\xff")), list))
|
|
30
32
|
self.assertNotIn(b"\x00", [b for b, c in histogram])
|
|
31
33
|
self.assertNotIn(b"\xff", [b for b, c in histogram])
|
|
32
34
|
histogram2 = ngrams_distribution(t, n_most_common=300)
|
|
@@ -1 +0,0 @@
|
|
|
1
|
-
0.4.3
|
|
@@ -1,67 +0,0 @@
|
|
|
1
|
-
# -*- coding: UTF-8 -*-
|
|
2
|
-
from math import log2
|
|
3
|
-
|
|
4
|
-
|
|
5
|
-
__all__ = ["ensure_str", "human_readable_size", "ngrams_counts", "ngrams_distribution", "shannon_entropy"]
|
|
6
|
-
|
|
7
|
-
shannon_entropy = lambda b: -sum([p*log2(p) for p in [float(ctr)/len(b) for ctr in [b.count(c) for c in set(b)]]]) or 0.
|
|
8
|
-
|
|
9
|
-
|
|
10
|
-
def ensure_str(s, encoding='utf-8', errors='strict'):
|
|
11
|
-
""" Ensure that an input string is decoded. """
|
|
12
|
-
if isinstance(s, bytes):
|
|
13
|
-
try:
|
|
14
|
-
return s.decode(encoding, errors)
|
|
15
|
-
except:
|
|
16
|
-
return s.decode("latin-1")
|
|
17
|
-
elif not isinstance(s, (str, bytes)):
|
|
18
|
-
raise TypeError("not expecting type '%s'" % type(s))
|
|
19
|
-
return s
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
def human_readable_size(size, precision=0):
|
|
23
|
-
""" Display bytes' size in a human-readable format given a precision. """
|
|
24
|
-
i, units = 0, ["B", "KB", "MB", "GB", "TB", "PB", "EB", "ZB", "YB"]
|
|
25
|
-
while size >= 1024 and i < len(units)-1:
|
|
26
|
-
i += 1
|
|
27
|
-
size /= 1024.0
|
|
28
|
-
return "%.*f%s" % (precision, size, units[i])
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
def ngrams_counts(byte_obj, n=1, step=1):
|
|
32
|
-
""" Output the Counter instance for an input byte sequence or byte object based on n-grams.
|
|
33
|
-
If the input is a byte object, cache the result.
|
|
34
|
-
|
|
35
|
-
:param byte_obj: byte sequence ('bytes') or byte object with "bytes" and "size" attributes (i.e. pathlib2.Path)
|
|
36
|
-
:param n: n determining the size of n-grams, defaults to 1
|
|
37
|
-
:param step: step for sliding the n-grams
|
|
38
|
-
"""
|
|
39
|
-
from collections import Counter
|
|
40
|
-
if isinstance(byte_obj, (str, bytes)):
|
|
41
|
-
return Counter(byte_obj[i:i+n] for i in range(0, len(byte_obj)-n+1, step))
|
|
42
|
-
elif hasattr(byte_obj, "bytes") and hasattr(byte_obj, "size"):
|
|
43
|
-
if not hasattr(byte_obj, "_ngram_counts_cache"):
|
|
44
|
-
byte_obj._ngram_counts_cache = {}
|
|
45
|
-
if n not in byte_obj._ngram_counts_cache.keys():
|
|
46
|
-
byte_obj._ngram_counts_cache[n] = Counter(byte_obj.bytes[i:i+n] for i in range(0, byte_obj.size-n+1, step))
|
|
47
|
-
return byte_obj._ngram_counts_cache[n]
|
|
48
|
-
raise TypeError("Bad input type ; should be a byte sequence or object")
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
def ngrams_distribution(byte_obj, n=1, step=1, n_most_common=None, n_exclude_top=0, exclude=None):
|
|
52
|
-
""" Compute the n-grams distribution of an input byte sequence or byte object given exclusions.
|
|
53
|
-
|
|
54
|
-
:param byte_obj: byte sequence ('bytes') or byte object with "bytes" and "size" attributes (i.e. pathlib2.Path)
|
|
55
|
-
:param n: n determining the size of n-grams, defaults to 1
|
|
56
|
-
:param step: step for sliding the n-grams
|
|
57
|
-
:param n_most_common: number of n-grams to be kept in the result, keep all by default
|
|
58
|
-
:param n_exclude_top: number of n-grams to be excluded from the top of the histogram, no exclusion by default
|
|
59
|
-
:param exclude: list of specific n-grams to be excluded, no exclusion by default
|
|
60
|
-
:return: list of n_most_common (n-gram, count) pairs
|
|
61
|
-
"""
|
|
62
|
-
c = ngrams_counts(byte_obj, n, step)
|
|
63
|
-
r = c.most_common(len(c) if n_most_common is None else n_most_common + n_exclude_top + len(exclude or []))
|
|
64
|
-
if exclude is not None:
|
|
65
|
-
r = [(ngram, count) for ngram, count in r if ngram not in exclude]
|
|
66
|
-
return r[n_exclude_top:n_exclude_top+(n_most_common or len(c))]
|
|
67
|
-
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|