maco 1.2.17__py3-none-any.whl → 1.2.18__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- extractor_setup/maco/__init__.py +0 -0
- extractor_setup/maco/base_test.py +98 -0
- extractor_setup/maco/cli.py +275 -0
- extractor_setup/maco/collector.py +220 -0
- extractor_setup/maco/exceptions.py +33 -0
- extractor_setup/maco/extractor.py +70 -0
- extractor_setup/maco/model/__init__.py +1 -0
- extractor_setup/maco/model/model.py +606 -0
- extractor_setup/maco/utils.py +587 -0
- extractor_setup/maco/yara.py +129 -0
- maco/utils.py +2 -2
- {maco-1.2.17.dist-info → maco-1.2.18.dist-info}/METADATA +2 -1
- {maco-1.2.17.dist-info → maco-1.2.18.dist-info}/RECORD +19 -9
- {maco-1.2.17.dist-info → maco-1.2.18.dist-info}/top_level.txt +1 -0
- model_setup/maco/utils.py +2 -2
- pipelines/publish.yaml +30 -24
- {maco-1.2.17.dist-info → maco-1.2.18.dist-info}/WHEEL +0 -0
- {maco-1.2.17.dist-info → maco-1.2.18.dist-info}/entry_points.txt +0 -0
- {maco-1.2.17.dist-info → maco-1.2.18.dist-info}/licenses/LICENSE.md +0 -0
|
File without changes
|
|
@@ -0,0 +1,98 @@
|
|
|
1
|
+
"""Foundation for unit testing an extractor.
|
|
2
|
+
|
|
3
|
+
Example:
|
|
4
|
+
from maco import base_test
|
|
5
|
+
class TestExample(base_test.BaseTest):
|
|
6
|
+
name = "Example"
|
|
7
|
+
path = os.path.join(__file__, "../../extractors")
|
|
8
|
+
def test_run(self):
|
|
9
|
+
data = b"data with Example information"
|
|
10
|
+
ret = self.extract(io.BytesIO(data))
|
|
11
|
+
self.assertEqual(ret["family"], "example")
|
|
12
|
+
"""
|
|
13
|
+
|
|
14
|
+
import importlib
|
|
15
|
+
import io
|
|
16
|
+
import os
|
|
17
|
+
import unittest
|
|
18
|
+
|
|
19
|
+
import cart
|
|
20
|
+
|
|
21
|
+
from maco import collector
|
|
22
|
+
from maco.exceptions import NoHitException
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
class BaseTest(unittest.TestCase):
    """Foundation for unit testing a single maco extractor.

    Subclasses must set ``name`` (the extractor class name) and ``path``
    (file or folder containing the extractor) before the suite runs.
    """

    # name of the extractor
    name: str = None
    # folder and/or file where extractor is.
    # I recommend something like os.path.join(__file__, "../../extractors")
    # if your extractors are in a folder 'extractors' next to a folder of tests
    path: str = None
    # build isolated virtual environments while collecting the extractor
    create_venv: bool = False

    @classmethod
    def setUpClass(cls) -> None:
        """Collect the extractor under test once for the whole test class.

        Raises:
            Exception: when name or path is not set.
        """
        if not (cls.name and cls.path):
            raise Exception("name and path must be set")
        cls.c = collector.Collector(cls.path, include=[cls.name], create_venv=cls.create_venv)
        return super().setUpClass()

    def test_default_metadata(self):
        """Require extractor to be loadable and valid."""
        self.assertIn(self.name, self.c.extractors)
        self.assertEqual(len(self.c.extractors), 1)

    def extract(self, stream):
        """Return results for running extractor over stream, including yara check.

        Raises:
            NoHitException: when yara rule doesn't hit.
        """
        if not self.c.match(stream):
            raise NoHitException("no yara rule hit")
        return self.c.extract(stream, self.name)

    @classmethod
    def _get_location(cls) -> str:
        """Return path to child class that implements this class."""
        # resolve the module that defines the subclass and report its file path
        return importlib.import_module(cls.__module__).__file__

    @classmethod
    def load_cart(cls, filepath: str) -> io.BytesIO:
        """Load and unneuter a test file (likely malware) into memory for processing.

        Args:
            filepath (str): Path to carted sample

        Returns:
            (io.BytesIO): Buffered stream containing the un-carted sample

        Raises:
            FileNotFoundError: if the path to the sample doesn't exist
        """
        # resolve relative paths against the module that subclasses base_test;
        # os.path.join leaves an absolute filepath untouched
        sample_path = os.path.join(os.path.dirname(cls._get_location()), filepath)
        if not os.path.isfile(sample_path):
            raise FileNotFoundError(sample_path)
        unpacked = io.BytesIO()
        with open(sample_path, "rb") as carted:
            # just bubble exceptions if it isn't cart
            cart.unpack_stream(carted, unpacked)
        # seek to start of the unneutered stream
        unpacked.seek(0)
        return unpacked
|
|
@@ -0,0 +1,275 @@
|
|
|
1
|
+
"""CLI example of how extractors can be executed."""
|
|
2
|
+
|
|
3
|
+
import argparse
|
|
4
|
+
import base64
|
|
5
|
+
import binascii
|
|
6
|
+
import hashlib
|
|
7
|
+
import io
|
|
8
|
+
import json
|
|
9
|
+
import logging
|
|
10
|
+
import os
|
|
11
|
+
import sys
|
|
12
|
+
from importlib.metadata import version
|
|
13
|
+
from typing import BinaryIO, List, Tuple
|
|
14
|
+
|
|
15
|
+
import cart
|
|
16
|
+
|
|
17
|
+
from maco import collector
|
|
18
|
+
|
|
19
|
+
# Module logger; a child of "maco.lib", whose level/handlers are configured in main().
logger = logging.getLogger("maco.lib.cli")
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
def process_file(
    collected: collector.Collector,
    path_file: str,
    stream: BinaryIO,
    *,
    pretty: bool,
    force: bool,
    include_base64: bool,
):
    """Process a filestream with the extractors and rules.

    Args:
        collected (collector.Collector): a Collector instance
        path_file (str): path to sample to be analyzed
        stream (BinaryIO): binary stream to be analyzed
        pretty (bool): Pretty print the JSON output
        force (bool): Run all extractors regardless of YARA rule match
        include_base64 (bool): include base64'd data in output

    Returns:
        (dict): The output from the extractors analyzing the sample
    """
    # transparently un-cart (unneuter) the sample if it is a cart container
    decoded = io.BytesIO()
    try:
        cart.unpack_stream(stream, decoded)
    except Exception:
        # invalid/malformed cart - keep analysing the original stream
        pass
    else:
        stream = decoded
    # unpack consumed bytes from whichever stream we kept, so rewind
    stream.seek(0)

    if force:
        # execute all extractors with no yara information
        # note - extractors may rely on a yara hit so this may cause errors
        runs = {extractor_name: [] for extractor_name in collected.extractors}
    else:
        # select extractors based on which yara rules hit
        runs = collected.match(stream)
    if not runs:
        return

    # run each selected extractor and gather its results
    logger.info(f"path: {path_file}")
    results = {}
    for extractor_name, hits in runs.items():
        logger.info(f"run {extractor_name} extractor from rules {[x.rule for x in hits]}")
        try:
            resp = collected.extract(stream, extractor_name)
        except Exception as e:
            logger.exception(f"extractor error with {path_file} ({e})")
            resp = None
        if resp:
            # replace raw bytes with printable summaries so the output is JSON-safe
            for row in resp.get("binaries", []):
                payload = row["data"]
                row["sha256"] = hashlib.sha256(payload).hexdigest()
                # number of bytes in the binary
                row["size"] = len(payload)
                # small sample of first part of binary
                row["hex_sample"] = binascii.hexlify(payload[:32]).decode("utf8").upper()
                if include_base64:
                    # this can be large
                    row["base64"] = base64.b64encode(payload).decode("utf8")
                # do not print raw bytes to console
                row.pop("data")
        results[extractor_name] = resp
        logger.info(json.dumps(resp, indent=2 if pretty else None))
        logger.info("")

    return results
|
|
97
|
+
|
|
98
|
+
|
|
99
|
+
def process_filesystem(
    path_extractors: str,
    path_samples: str,
    include: List[str],
    exclude: List[str],
    *,
    pretty: bool,
    force: bool,
    include_base64: bool,
    create_venv: bool = False,
    skip_install: bool = False,
) -> Tuple[int, int, int]:
    """Process filesystem with extractors and print results of extraction.

    Args:
        path_extractors (str): path to extractor file or directory
        path_samples (str): path to a sample file or a directory tree of samples
        include (List[str]): extractor names to run exclusively (empty runs all)
        exclude (List[str]): extractor names to skip
        pretty (bool): pretty print the JSON output
        force (bool): run all extractors regardless of YARA rule match
        include_base64 (bool): include base64'd binary data in output
        create_venv (bool): create venvs for each requirements.txt found
        skip_install (bool): skip installing extractor dependencies

    Returns:
        (Tuple[int, int, int]): Total number of analysed files, yara hits and successful maco extractions.
    """
    if force:
        logger.warning("force execute will cause errors if an extractor requires a yara rule hit during execution")
    collected = collector.Collector(
        path_extractors, include=include, exclude=exclude, create_venv=create_venv, skip_install=skip_install
    )

    logger.info(f"extractors loaded: {list(collected.extractors.keys())}\n")
    for loaded in collected.extractors.values():
        extractor_meta = loaded["metadata"]
        logger.info(
            f"{extractor_meta['family']} by {extractor_meta['author']}"
            f" {extractor_meta['last_modified']} {extractor_meta['sharing']}"
            f"\n{extractor_meta['description']}\n"
        )

    num_analysed = 0
    num_hits = 0
    num_extracted = 0
    if os.path.isfile(path_samples):
        # analyse a single file
        walker = [("", None, [path_samples])]
    elif os.path.isdir(path_samples):
        # load files from directory tree
        walker = os.walk(path_samples)
    else:
        logger.error(f"not file or folder: {path_samples}")
        # sys.exit rather than the site-provided exit() builtin,
        # which is not guaranteed to be available (e.g. under python -S)
        sys.exit(2)
    try:
        base_directory = os.path.abspath(path_samples)
        for path, _, files in walker:
            for file in files:
                num_analysed += 1
                path_file = os.path.abspath(os.path.join(path, file))
                # commonpath rather than a raw prefix check so a sibling such as
                # "/samples_evil" cannot masquerade as being inside "/samples"
                try:
                    inside = os.path.commonpath([base_directory, path_file]) == base_directory
                except ValueError:
                    # e.g. paths on different drives (Windows) can never be inside the base
                    inside = False
                if not inside:
                    logger.error(f"Attempted path traversal detected: {path_file}")
                    continue

                try:
                    with open(path_file, "rb") as stream:
                        resp = process_file(
                            collected,
                            path_file,
                            stream,
                            pretty=pretty,
                            force=force,
                            include_base64=include_base64,
                        )
                    if resp:
                        # at least one yara rule hit for this file
                        num_hits += 1
                        if any(resp.values()):
                            # at least one extractor produced output
                            num_extracted += 1
                except Exception as e:
                    logger.exception(f"file error with {path_file} ({e})")
                    continue
    finally:
        # always print the summary, even if the walk aborts part-way
        logger.info("")
        logger.info(f"{num_analysed} analysed, {num_hits} hits, {num_extracted} extracted")
    return num_analysed, num_hits, num_extracted
|
|
176
|
+
|
|
177
|
+
|
|
178
|
+
def main():
    """Entry point for the maco CLI.

    Parses command line arguments, configures verbosity on the "maco.lib"
    (library) and "maco.extractor" loggers, then runs the extractors over
    the given samples via process_filesystem.
    """
    parser = argparse.ArgumentParser(description="Run extractors over samples.")
    parser.add_argument("extractors", type=str, help="path to extractors")
    parser.add_argument("samples", type=str, help="path to samples")
    parser.add_argument(
        "-v",
        "--verbose",
        action="count",
        default=0,
        help="print debug logging. -v extractor info, -vv extractor debug, -vvv cli debug",
    )
    parser.add_argument("--pretty", action="store_true", help="pretty print json output")
    parser.add_argument(
        "--base64",
        action="store_true",
        help="Include base64 encoded binary data in output "
        "(can be large, consider printing to file rather than console)",
    )
    parser.add_argument("--logfile", type=str, help="file to log output")
    parser.add_argument("--include", type=str, help="comma separated extractors to run")
    parser.add_argument("--exclude", type=str, help="comma separated extractors to not run")
    parser.add_argument(
        "-f",
        "--force",
        action="store_true",
        help="ignore yara rules and execute all extractors",
    )
    parser.add_argument(
        "--create_venv",
        action="store_true",
        help="Creates venvs for every requirements.txt found (only applies when extractor path is a directory). "
        "This runs much slower than the alternative but may be necessary "
        "when there are many extractors with conflicting dependencies.",
    )
    parser.add_argument(
        "--force_install",
        action="store_true",
        help="Force installation of Python dependencies for extractors (in both host and virtual environments).",
    )
    parser.add_argument(
        "--version",
        action="version",
        version=f"version: {version('maco')}",
        help="Show version of MACO",
    )

    args = parser.parse_args()
    # comma-separated CLI values become lists for the Collector
    inc = args.include.split(",") if args.include else []
    exc = args.exclude.split(",") if args.exclude else []

    # set up logging for lib, only show debug with 3+ verbose
    logger_lib = logging.getLogger("maco.lib")
    logger_lib.setLevel(logging.DEBUG if args.verbose > 2 else logging.INFO)
    ch = logging.StreamHandler(sys.stdout)
    # handler passes everything through; the logger's level does the filtering
    ch.setLevel(logging.DEBUG)
    logger_lib.addHandler(ch)

    # set up logging for extractor
    logger_ex = logging.getLogger("maco.extractor")
    if args.verbose == 0:
        logger_ex.setLevel(logging.WARNING)
    elif args.verbose == 1:
        logger_ex.setLevel(logging.INFO)
    else:
        logger_ex.setLevel(logging.DEBUG)
    # fresh handler for extractor output (rebinds ch; the lib handler above keeps its own reference)
    ch = logging.StreamHandler(sys.stdout)
    ch.setLevel(logging.DEBUG)
    formatter = logging.Formatter(
        fmt="%(asctime)s, [%(levelname)s] %(module)s.%(funcName)s: %(message)s", datefmt="%Y-%m-%d (%H:%M:%S)"
    )
    ch.setFormatter(formatter)
    logger_ex.addHandler(ch)

    # log everything to file
    if args.logfile:
        logger = logging.getLogger("maco")
        # NOTE(review): this raises the console "maco.lib" logger to DEBUG, overriding
        # the verbosity chosen above — confirm whether `logger` (the "maco" logger the
        # file handler is attached to) was intended here instead of `logger_lib`.
        logger_lib.setLevel(logging.DEBUG)
        fh = logging.FileHandler(args.logfile)
        fh.setLevel(logging.DEBUG)
        fh.setFormatter(formatter)
        logger.addHandler(fh)

    process_filesystem(
        args.extractors,
        args.samples,
        inc,
        exc,
        pretty=args.pretty,
        force=args.force,
        include_base64=args.base64,
        create_venv=args.create_venv,
        skip_install=not args.force_install,
    )


if __name__ == "__main__":
    main()
|
|
@@ -0,0 +1,220 @@
|
|
|
1
|
+
"""Convenience functions for discovering your extractors."""
|
|
2
|
+
|
|
3
|
+
import inspect
|
|
4
|
+
import logging
|
|
5
|
+
import logging.handlers
|
|
6
|
+
import os
|
|
7
|
+
import sys
|
|
8
|
+
from tempfile import NamedTemporaryFile
|
|
9
|
+
from types import ModuleType
|
|
10
|
+
from typing import Any, BinaryIO, Dict, List, TypedDict, Union
|
|
11
|
+
|
|
12
|
+
from multiprocess import Manager, Process, Queue
|
|
13
|
+
from pydantic import BaseModel
|
|
14
|
+
|
|
15
|
+
from maco import extractor, model, utils, yara
|
|
16
|
+
from maco.exceptions import AnalysisAbortedException, ExtractorLoadError
|
|
17
|
+
|
|
18
|
+
# Module logger; a child of "maco.lib" so the CLI's verbosity settings apply here too.
logger = logging.getLogger("maco.lib.helpers")
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
def _verify_response(resp: Union[BaseModel, dict]) -> Dict:
    """Enforce types and verify properties, and remove defaults.

    Args:
        resp (Union[BaseModel, dict]): results from extractor

    Returns:
        (Dict): results from extractor after verification
    """
    if not resp:
        return None

    # validate against the extractor's own model first; useful if it places
    # restrictions on the free-form 'other' dictionary
    own_model = type(resp)
    if own_model != model.ExtractorModel and hasattr(own_model, "model_validate"):
        resp = own_model.model_validate(resp)

    # then validate against the shared ExtractorModel contract
    checked = model.ExtractorModel.model_validate(resp)
    # round-trip through the constructor so sets are coerced to the declared
    # types (otherwise we end up with sets where we expect lists)
    normalised = model.ExtractorModel(**checked.model_dump())
    # dump to a plain dict, stripping fields left at their defaults
    return normalised.model_dump(exclude_defaults=True)
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
class ExtractorMetadata(TypedDict):
    """Extractor-supplied metadata."""

    # author of the extractor, conventionally "name@organisation"
    author: str
    # malware family targeted by the extractor
    family: str
    # last modified date, "YYYY-MM-DD" by convention
    last_modified: str
    # sharing designation, e.g. "TLP:WHITE"
    sharing: str
    # human-readable description (the extractor class docstring)
    description: str
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
class ExtractorRegistration(TypedDict):
    """Registration collected by the collector for a single extractor."""

    # venv associated with the extractor; passed through to utils.run_extractor
    venv: str
    # filesystem path to the module file that defines the extractor
    module_path: str
    # dotted module name the extractor class was imported from
    module_name: str
    # name of the extractor class within that module
    extractor_class: str
    # descriptive metadata supplied by the extractor itself
    metadata: ExtractorMetadata
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
class Collector:
    """Discover and load extractors from file system."""

    def __init__(
        self,
        path_extractors: str,
        include: List[str] = None,
        exclude: List[str] = None,
        create_venv: bool = False,
        skip_install: bool = False,
    ):
        """Discover and load extractors from file system.

        Args:
            path_extractors (str): file or directory to search for extractors
            include (List[str]): extractor class names to load exclusively (None loads all)
            exclude (List[str]): extractor class names to skip
            create_venv (bool): create venvs for discovered requirements
                (only applied when path_extractors is a directory)
            skip_install (bool): skip installing extractor dependencies

        Raises:
            ExtractorLoadError: when no extractors are found
        """
        # maco requires the extractor to be imported directly, so ensure they are available on the path
        full_path_extractors = os.path.abspath(path_extractors)
        full_path_above_extractors = os.path.dirname(full_path_extractors)
        # Modify the PATH so we can recognize this new package on import
        if full_path_extractors not in sys.path:
            sys.path.insert(1, full_path_extractors)
        if full_path_above_extractors not in sys.path:
            sys.path.insert(1, full_path_above_extractors)

        path_extractors = os.path.realpath(path_extractors)
        self.path: str = path_extractors
        self.extractors: Dict[str, ExtractorRegistration] = {}

        # Manager dicts are shared with the discovery child process, so
        # registrations made there are visible here after join()
        with Manager() as manager:
            extractors = manager.dict()
            namespaced_rules = manager.dict()

            def extractor_module_callback(module: ModuleType, venv: str):
                # invoked (in the child process) for each candidate module found
                members = inspect.getmembers(module, predicate=utils.maco_extractor_validation)
                for member in members:
                    name, member = member
                    if exclude and name in exclude:
                        # Module is part of the exclusion list, skip
                        # NOTE(review): this returns rather than continues, so remaining
                        # members of the same module are skipped too — confirm intended
                        logger.debug(f"exclude excluded '{name}'")
                        return

                    if include and name not in include:
                        # Module wasn't part of the inclusion list, skip
                        logger.debug(f"include excluded '{name}'")
                        return

                    # initialise and register
                    logger.debug(f"register '{name}'")
                    extractors[name] = dict(
                        venv=venv,
                        module_path=module.__file__,
                        module_name=member.__module__,
                        extractor_class=member.__name__,
                        metadata={
                            "family": member.family,
                            "author": member.author,
                            "last_modified": member.last_modified,
                            "sharing": member.sharing,
                            "description": member.__doc__,
                        },
                    )
                    # fall back to the match-everything default rule when the
                    # extractor does not supply its own
                    namespaced_rules[name] = member.yara_rule or extractor.DEFAULT_YARA_RULE.format(name=name)

            # multiprocess logging is awkward - set up a queue to ensure we can log
            logging_queue = Queue()
            queue_handler = logging.handlers.QueueListener(logging_queue, *logging.getLogger().handlers)
            queue_handler.start()

            # Find the extractors within the given directory
            # Execute within a child process to ensure main process interpreter is kept clean
            p = Process(
                target=utils.proxy_logging,
                args=(
                    logging_queue,
                    utils.import_extractors,
                    extractor_module_callback,
                ),
                kwargs=dict(
                    root_directory=path_extractors,
                    scanner=yara.compile(source=utils.MACO_YARA_RULE),
                    # venv creation only makes sense for a directory of extractors
                    create_venv=create_venv and os.path.isdir(path_extractors),
                    skip_install=skip_install,
                ),
            )
            p.start()
            p.join()

            # stop multiprocess logging
            queue_handler.stop()
            logging_queue.close()

            # snapshot the shared dicts before the manager shuts down
            self.extractors = dict(extractors)
            if not self.extractors:
                raise ExtractorLoadError("no extractors were loaded")
            logger.debug(f"found extractors {list(self.extractors.keys())}\n")

            # compile yara rules gathered from extractors
            self.rules = yara.compile(sources=dict(namespaced_rules))

    def match(self, stream: BinaryIO) -> Dict[str, List[yara.Match]]:
        """Return extractors that should run based on yara rules.

        Args:
            stream (BinaryIO): stream to scan; position is restored afterwards

        Returns:
            (Dict[str, List[yara.Match]]): hits keyed by extractor name, or None when nothing matched
        """
        # execute yara rules on file to find extractors we should run
        # yara can't run on a stream so we give it a bytestring
        matches = self.rules.match(data=stream.read())
        stream.seek(0)
        if not matches:
            return
        # get all rules that hit for each extractor
        # (the rule namespace is the extractor name the rule was registered under)
        runs = {}
        for match in matches:
            runs.setdefault(match.namespace, []).append(match)

        return runs

    def extract(
        self,
        stream: BinaryIO,
        extractor_name: str,
    ) -> Dict[str, Any]:
        """Run extractor with stream and verify output matches the model.

        Args:
            stream (BinaryIO): Binary stream to analyze
            extractor_name (str): Name of extractor to analyze stream

        Returns:
            (Dict[str, Any]): Results from extractor
        """
        # NOTE: this local shadows the `extractor` module imported at file level
        extractor = self.extractors[extractor_name]
        try:
            # Run extractor on a copy of the sample
            with NamedTemporaryFile() as sample_path:
                sample_path.write(stream.read())
                sample_path.flush()
                # enforce types and verify properties, and remove defaults
                return _verify_response(
                    utils.run_extractor(
                        sample_path.name,
                        module_name=extractor["module_name"],
                        extractor_class=extractor["extractor_class"],
                        module_path=extractor["module_path"],
                        venv=extractor["venv"],
                    )
                )
        except AnalysisAbortedException:
            # Extractor voluntarily aborted analysis of sample
            return
        except Exception:
            # caller can deal with the exception
            raise
        finally:
            # make sure to reset where we are in the file
            # otherwise follow on extractors are going to read 0 bytes
            stream.seek(0)
|
|
@@ -0,0 +1,33 @@
|
|
|
1
|
+
"""Exception classes for extractors."""
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
# Can be raised by extractors to abort analysis of a sample
# ie. Can abort if preliminary checks at start of run indicate the file shouldn't be analyzed by extractor
class AnalysisAbortedException(Exception):
    """Signal that an extractor voluntarily declined to analyse a sample."""
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
class ExtractorLoadError(Exception):
    """Signal that no extractor could be discovered or loaded."""
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
class InvalidExtractor(ValueError):
    """Signal that an extractor definition is malformed or incomplete."""
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
class NoHitException(Exception):
    """Signal that none of an extractor's YARA rules matched the sample."""
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
# NOTE: deliberately keeps the historical name even though it shadows the
# builtin SyntaxError within this module; renaming would break importers.
class SyntaxError(Exception):
    """Signal a syntax error inside an extractor's YARA rule."""
|
|
@@ -0,0 +1,70 @@
|
|
|
1
|
+
"""Base class for an extractor script."""
|
|
2
|
+
|
|
3
|
+
import logging
|
|
4
|
+
import textwrap
|
|
5
|
+
from typing import BinaryIO, List, Optional, Union
|
|
6
|
+
|
|
7
|
+
from maco import model, yara
|
|
8
|
+
from maco.exceptions import InvalidExtractor
|
|
9
|
+
|
|
10
|
+
DEFAULT_YARA_RULE = """
|
|
11
|
+
rule {name}
|
|
12
|
+
{{
|
|
13
|
+
condition:
|
|
14
|
+
true
|
|
15
|
+
}}
|
|
16
|
+
"""
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
class Extractor:
    """Base class for an analysis extractor with common entrypoint and metadata.

    Override this docstring with a good description of your extractor.
    """

    family: Union[str, List[str]] = None  # family or families of malware that is detected by the extractor
    author: str = None  # author of the extractor (name@organisation)
    last_modified: str = None  # last modified date (YYYY-MM-DD)
    sharing: str = "TLP:WHITE"  # who can this be shared with?
    yara_rule: str = None  # yara rule that we filter inputs with
    reference: str = None  # link to malware report or other reference information
    logger: logging.Logger = None  # logger for use when debugging

    def __init__(self) -> None:
        """Initialise the extractor.

        Validates the required metadata attributes and compiles the yara
        rule(s), falling back to DEFAULT_YARA_RULE when none is supplied.

        Raises:
            InvalidExtractor: When the extractor is invalid.
        """
        # the subclass name doubles as the extractor's registered name
        self.name = name = type(self).__name__
        self.logger = logging.getLogger(f"maco.extractor.{name}")
        self.logger.debug(f"initialise '{name}'")
        if not self.family or not self.author or not self.last_modified:
            raise InvalidExtractor("must set family, author, last_modified")
        # if author does not set a yara rule, match on everything
        if not self.yara_rule:
            self.yara_rule = DEFAULT_YARA_RULE.format(name=name)
        # unindent the yara rule from triple quoted string
        # this is for friendly printing, yara handles the rule ok either way
        self.yara_rule = textwrap.dedent(self.yara_rule)
        # check yara rules conform to expected structure
        # we throw away these compiled rules as we need all rules in system compiled together
        try:
            self.yara_compiled = yara.compile(source=self.yara_rule)
        except yara.SyntaxError as e:
            raise InvalidExtractor(f"{self.name} - invalid yara rule") from e
        # need to track which plugin owns the rules
        self.yara_rule_names = [x.identifier for x in self.yara_compiled]
        if not len(list(self.yara_compiled)):
            raise InvalidExtractor(f"{name} must define at least one yara rule")
        for x in self.yara_compiled:
            # global rules would gate every other extractor's rules once compiled together
            if x.is_global:
                raise InvalidExtractor(f"{x.identifier} yara rule must not be global")

    def run(self, stream: BinaryIO, matches: List[yara.Match]) -> Optional[model.ExtractorModel]:
        """Run the analysis process and return dict matching.

        :param stream: file object from disk/network/memory.
        :param matches: yara rule match information contains locations of strings.
        """
        raise NotImplementedError()
|