maco 1.2.17__py3-none-any.whl → 1.2.18__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,587 @@
1
+ """Common utilities shared between the MACO collector and configextractor-py."""
2
+
3
+ import importlib
4
+ import inspect
5
+ import json
6
+ import logging
7
+ import logging.handlers
8
+ import os
9
+ import re
10
+ import shutil
11
+ import subprocess
12
+ import sys
13
+ import tempfile
14
+ from importlib.machinery import SourceFileLoader
15
+
16
+ from multiprocess import Process, Queue
17
+
18
+ from maco import yara
19
+
20
+ if sys.version_info >= (3, 11):
21
+ import tomllib
22
+ else:
23
+ import tomli as tomllib
24
+
25
+ from base64 import b64decode
26
+ from copy import deepcopy
27
+ from glob import glob
28
+ from logging import Logger
29
+ from types import ModuleType
30
+ from typing import Callable, Dict, List, Tuple, Union
31
+
32
+ from uv import find_uv_bin
33
+
34
+ from maco import model
35
+ from maco.exceptions import AnalysisAbortedException
36
+ from maco.extractor import Extractor
37
+
38
# Module-level logger shared by the helpers in this file
logger = logging.getLogger("maco.lib.utils")

# Name of the per-directory virtual environment folder that is created/looked up
VENV_DIRECTORY_NAME = ".venv"

# Byte patterns that capture the leading dots of relative imports:
# any "from .<something>" and the bare "from . import" form respectively
RELATIVE_FROM_RE = re.compile(rb"from (\.+)")
RELATIVE_FROM_IMPORT_RE = re.compile(rb"from (\.+) import")

# Location of the uv executable, as reported by the uv package
UV_BIN = find_uv_bin()

# Space-separated command prefixes built on top of the uv executable.
# NOTE: consumers split these on single spaces.
PIP_CMD = "{} pip".format(UV_BIN)
VENV_CREATE_CMD = "{} venv".format(UV_BIN)
49
+
50
+
51
class Base64Decoder(json.JSONDecoder):
    """JSON decoder that also decodes base64 encoded binary data.

    Objects shaped like ``{"__class__": "bytes", "data": "<base64>"}`` are
    replaced by the decoded ``bytes``; every other object is left untouched.
    """

    def __init__(self, *args, **kwargs):
        """Initialize the decoder with :meth:`object_hook` installed."""
        json.JSONDecoder.__init__(self, *args, object_hook=self.object_hook, **kwargs)

    def object_hook(self, obj):
        """Hook to decode base64 encoded binary data.

        Returns:
            The decoded ``bytes`` when ``obj`` is tagged as bytes, otherwise ``obj`` unchanged.
        """  # noqa: DOC201
        # Only objects explicitly tagged as bytes carry a base64 payload
        if obj.get("__class__") == "bytes":
            return b64decode(obj["data"])
        return obj
66
+
67
+
68
# Template of the standalone script used to run one extractor inside its own
# virtual environment. It is rendered with ``str.format`` using the keys
# ``parent_package_path``, ``module_name``, ``module_class``, ``sample_path`` and
# ``output_path``, so the script body must not contain any other curly braces
# (hence ``dict()`` rather than a literal).
VENV_SCRIPT = """
import importlib
import json
import os
import sys
import logging

try:
    # Respect cases where the extractor is tied to certain version of yara-python for processing
    import yara
except Exception:
    # Otherwise fallback to MACO's interface for yara-python==4.5.x
    from maco import yara

from base64 import b64encode

# ensure we have a logger to stderr
logger = logging.getLogger()
logger.setLevel(logging.DEBUG)
sh = logging.StreamHandler()
logger.addHandler(sh)
sh.setLevel(logging.DEBUG)
formatter = logging.Formatter(
    fmt="%(asctime)s, [%(levelname)s] %(module)s.%(funcName)s: %(message)s", datefmt="%Y-%m-%d (%H:%M:%S)"
)
sh.setFormatter(formatter)

parent_package_path = "{parent_package_path}"
sys.path.insert(1, parent_package_path)
mod = importlib.import_module("{module_name}")

class Base64Encoder(json.JSONEncoder):
    def default(self, o):
        if isinstance(o, bytes):
            return dict(__class__="bytes", data=b64encode(o).decode())
        return json.JSONEncoder.default(self, o)
matches = []
if mod.{module_class}.yara_rule:
    matches = yara.compile(source=mod.{module_class}.yara_rule).match("{sample_path}")
# make sure the sample handle is released once the extractor has finished
with open("{sample_path}", 'rb') as sample:
    result = mod.{module_class}().run(sample, matches=matches)

with open("{output_path}", 'w') as fp:
    if not result:
        json.dump(dict(), fp)
    else:
        try:
            json.dump(result.model_dump(exclude_defaults=True, exclude_none=True), fp, cls=Base64Encoder)
        except AttributeError:
            # venv likely has an older version of Pydantic < 2 installed
            json.dump(result.dict(exclude_defaults=True, exclude_none=True), fp, cls=Base64Encoder)
"""
120
+
121
# YARA rule used to pre-filter Python source files: a file matches when it
# imports maco, mentions "Extractor" and declares a class deriving from a
# (possibly dotted) "...Extractor" base.
MACO_YARA_RULE = r"""
rule MACO {
    meta:
        desc = "Used to match on Python files that contain MACO extractors"
    strings:
        $from = "from maco"
        $import = "import maco"
        $extractor = "Extractor"
        $class = /class \w+\(([a-zA-Z.]+)?Extractor\)\:/
    condition:
        ($from or $import) and $extractor and $class
}
"""
134
+
135
+
136
def maco_extractor_validation(module: ModuleType) -> bool:
    """Validation function for extractors.

    Args:
        module (ModuleType): Candidate object to validate; only classes can pass.

    Returns:
        (bool): True if the candidate is a class with a non-empty ``author``, False otherwise.
    """
    if not inspect.isclass(module):
        return False
    # 'author' has to be implemented otherwise will raise an exception according to MACO.
    # Coerce to bool so callers get the annotated type rather than the author value itself.
    return bool(getattr(module, "author", None))
146
+
147
+
148
def maco_extract_rules(module: Extractor) -> str:
    """Return the YARA rules declared on an extractor.

    Args:
        module (Extractor): Extractor whose ``yara_rule`` attribute is read

    Returns:
        (str): YARA rules
    """
    rules = module.yara_rule
    return rules
155
+
156
+
157
def scan_for_extractors(root_directory: str, scanner: yara.Rules, logger: Logger) -> Tuple[List[str], List[str]]:
    """Walk a directory tree and collect the Python files flagged by the scanner.

    While walking, files that live inside a detected package have their relative
    ``from .`` imports rewritten in place into absolute imports.

    Args:
        root_directory (str): Root directory containing extractors
        scanner (yara.Rules): Scanner to look for extractors using YARA rules
        logger (Logger): Logger to use

    Returns:
        Tuple[List[str], List[str]]: Directories holding extractors (collected in a set that always
            contains the root directory) and the real paths of the extractor files

    """
    found_dirs = {root_directory}
    found_files = []

    def should_skip(name, full_path):
        # Virtual environments are never searched for extractors
        if name == VENV_DIRECTORY_NAME:
            return True
        # Non-Python files are not scanned
        if not name.endswith(".py") and os.path.isfile(full_path):
            return True
        # Setup files are not extractors
        if name == "setup.py":
            return True
        # Anything that looks like a test or is marked deprecated is ignored
        return "test" in name or "deprecated" in name

    def rewrite_relative_imports(file_path, package):
        with open(file_path, "rb") as handle:
            contents = handle.read()

        rewritten = False
        # Path components of the containing directory, innermost first
        reversed_parts = os.path.dirname(file_path).split("/")[::-1]
        for pattern in [RELATIVE_FROM_IMPORT_RE, RELATIVE_FROM_RE]:
            # "from .x" needs a dot between the absolute prefix and "x"; "from . import" does not
            joiner = "." if pattern == RELATIVE_FROM_RE else ""
            for dots in pattern.findall(contents):
                level = dots.count(b".")
                prefix = ".".join(reversed_parts[level - 1 : reversed_parts.index(package) + 1][::-1])
                contents = contents.replace(
                    f"from {dots.decode()}".encode(), f"from {prefix}{joiner}".encode(), 1
                )
                rewritten = True

        # only write extractor files if imports were changed
        if rewritten:
            with open(file_path, "wb") as handle:
                handle.write(contents)

    def scan_and_repair(directory, package=None):
        entries = os.listdir(directory)
        folder_name = os.path.basename(directory)

        if not package and "__init__.py" in entries and "-" not in folder_name:
            # Perhaps we've found the outermost package?
            package = folder_name

        for entry in entries:
            entry_path = os.path.join(directory, entry)
            if should_skip(entry, entry_path):
                continue

            if not os.path.isfile(entry_path):
                # Descend into sub-directories, carrying the detected package along
                scan_and_repair(entry_path, package)
                continue

            if package:
                # Replace any relative importing with absolute before scanning
                rewrite_relative_imports(entry_path, package)

            if scanner.match(entry_path):
                # Add directory to list of hits for venv creation
                found_dirs.add(directory)
                found_files.append(os.path.realpath(entry_path))

    # Search for extractors using YARA rules
    logger.info("Searching for prospective extractors based on YARA rules..")
    scan_and_repair(root_directory)

    return found_dirs, found_files
233
+
234
+
235
def _install_required_packages(create_venv: bool, directories: List[str], python_version: str, logger: Logger):
    """Install the Python requirements found next to (or above) the extractor directories.

    Each directory is walked upwards until the parent of the alphabetically first
    directory is reached; every visited directory that holds a ``requirements.txt``
    or ``pyproject.toml`` gets its dependencies installed once.

    Args:
        create_venv (bool): Install into a per-directory virtual environment instead of the current one
        directories (List[str]): Directories in which extractors were found
        python_version (str): Python version requested when a virtual environment is created
        logger (Logger): Logger to use

    Returns:
        (List[str]): Virtual environments that had their dependencies installed successfully
    """
    venvs = []
    if not directories:
        # Nothing to inspect; also avoids indexing into an empty list below
        return venvs

    env = deepcopy(os.environ)
    stop_directory = os.path.dirname(sorted(directories)[0])
    # Track directories that we've already visited
    visited_dirs = set()
    for directory in directories:
        # Recurse backwards through the directory structure to look for package requirements
        while directory != stop_directory and directory not in visited_dirs:
            req_files = {"requirements.txt", "pyproject.toml"}.intersection(os.listdir(directory))
            if req_files:
                venv_path = _install_directory_requirements(
                    directory, req_files, create_venv, python_version, env, logger
                )
                if venv_path:
                    venvs.append(venv_path)

            # Always advance to the parent, whatever the outcome of the installation,
            # otherwise the same directory would be processed forever.
            visited_dirs.add(directory)
            directory = os.path.dirname(directory)
    return venvs


def _install_directory_requirements(
    directory: str, req_files, create_venv: bool, python_version: str, env, logger: Logger
) -> Union[str, None]:
    """Install the requirements of one directory; return its venv path on success in venv mode, else None."""
    venv_path = None
    # create a virtual environment, otherwise directly install into current env
    if create_venv:
        venv_path = os.path.join(directory, VENV_DIRECTORY_NAME)
        logger.info(f"Updating virtual environment {venv_path}")
        env.update({"VIRTUAL_ENV": venv_path})
        # Create a virtual environment for the directory
        if not os.path.exists(venv_path):
            cmd = f"{VENV_CREATE_CMD} --python {python_version}"
            subprocess.run(cmd.split(" ") + [venv_path], capture_output=True, env=env)

    # Install/Update the packages in the environment
    # NOTE: the command constants are split on single spaces
    install_command = PIP_CMD.split(" ") + ["install"]
    # When running locally, only install packages to required spec.
    # This prevents issues during maco development and building extractors against local libraries.
    if create_venv:
        # when running in custom virtual environment, always upgrade packages.
        install_command.extend(["--upgrade", "--no-cache"])

    # Update the pip install command depending on where the dependencies are coming from
    if "requirements.txt" in req_files:
        # Perform a pip install using the requirements flag
        install_command.extend(["--requirements", "requirements.txt"])
    elif "pyproject.toml" in req_files:
        # Assume we're dealing with a project directory
        pyproject_command = ["--editable", "."]

        # Check to see if there are optional dependencies required
        with open(os.path.join(directory, "pyproject.toml"), "rb") as f:
            parsed_toml_project = tomllib.load(f).get("project", {})
        for dep_name, dependencies in parsed_toml_project.get("optional-dependencies", {}).items():
            # Look for the dependency that hints at use of MACO for the extractors
            if "maco" in " ".join(dependencies):
                pyproject_command = [f".[{dep_name}]"]
                break

        install_command.extend(pyproject_command)

    # Always require maco-extractor to be installed
    install_command.append("maco-extractor")
    logger.debug(f"Install command: {' '.join(install_command)} [{directory}]")
    # this uses VIRTUAL_ENV to control usage of a virtual environment
    p = subprocess.run(
        install_command,
        cwd=directory,
        capture_output=True,
        env=env,
    )
    installed = p.returncode == 0
    if installed:
        logger.debug(f"Installed dependencies into venv:\n{p.stdout.decode()}\n{p.stderr.decode()}")
    elif b"is being installed using the legacy" in p.stderr:
        # Ignore these types of errors and move on to the next directory
        return None
    else:
        logger.error(f"Error installing into venv:\n{p.stdout.decode()}\n{p.stderr.decode()}")

    # Cleanup any build directories that are the product of package installation
    expected_build_path = os.path.join(directory, "build")
    if os.path.exists(expected_build_path):
        shutil.rmtree(expected_build_path)

    # venv_path is only set in venv mode, so nothing is reported for in-place installs
    return venv_path if installed else None
312
+
313
+
314
def find_and_insert_venv(path: str, venvs: List[str]) -> Tuple[str, str]:
    """Finds the closest virtual environment to the extractor and inserts it into the PATH.

    Args:
        path (str): Path of extractor
        venvs (List[str]): List of virtual environments

    Returns:
        (Tuple[str, str]): Virtual environment and site-packages path that's closest to the extractor.
            ``(None, None)`` when no virtual environment sits in a directory above the extractor;
            the site-packages entry is ``None`` when the environment has no
            ``lib/python*/site-packages`` directory.
    """
    # A venv is only usable when the directory holding it is an ancestor of the extractor
    candidates = [
        candidate for candidate in sorted(venvs, reverse=True) if path.startswith(f"{os.path.dirname(candidate)}/")
    ]
    if not candidates:
        return None, None

    # All candidate parents are prefixes of the same path, so the longest one is the closest
    venv = max(candidates, key=lambda candidate: len(os.path.dirname(candidate)))

    # Insert the venv's site-packages into the PATH temporarily to load the module
    site_package = None
    for site_package in glob(os.path.join(venv, "lib/python*/site-packages")):
        if site_package not in sys.path:
            sys.path.insert(2, site_package)
            break

    return venv, site_package
342
+
343
+
344
def register_extractor_module(
    extractor_source_file: str,
    module_name: str,
    venvs: List[str],
    extractor_module_callback: Callable[[ModuleType, str], None],
    logger: Logger,
):
    """Register the extractor module in isolation.

    The module is loaded straight from its source file and handed to the callback
    together with the virtual environment that was selected for it (if any).

    Args:
        extractor_source_file (str): Path to source file of extractor
        module_name (str): The name of the module relative to the package directory
        venvs (List[str]): List of virtual environments
        extractor_module_callback (Callable[[ModuleType, str], None]): Callback used to register extractors
        logger (Logger): Logger to use

    """
    # Bind these before the try block so the cleanup below never fails on an
    # unbound name (which would hide the real error) when the lookup itself raises.
    venv = site_packages = None
    try:
        logger.info(f"Inspecting '{extractor_source_file}' for extractors..")
        venv, site_packages = find_and_insert_venv(extractor_source_file, venvs)
        loader = SourceFileLoader(
            module_name,
            extractor_source_file,
        )
        extractor_module_callback(loader.load_module(), venv)
    finally:
        # Cleanup virtual environment that was loaded into PATH
        if venv and site_packages in sys.path:
            sys.path.remove(site_packages)
373
+
374
+
375
def register_extractors(
    current_directory: str,
    venvs: List[str],
    extractor_files: List[str],
    extractor_module_callback: Callable[[ModuleType, str], None],
    logger: Logger,
):
    """Register every extractor file found under the current directory.

    Each file is registered in its own child process; all processes are started
    first and then joined.

    Args:
        current_directory (str): Current directory to register extractors found
        venvs (List[str]): List of virtual environments
        extractor_files (List[str]): List of extractor files found
        extractor_module_callback (Callable[[ModuleType, str], None]): Callback used to register extractors
        logger (Logger): Logger to use
    """
    parent_directory, package_name = os.path.dirname(current_directory), os.path.basename(current_directory)

    if venvs and package_name in sys.modules:
        # this may happen as part of testing if some part of the extractor code was directly imported
        logger.warning(
            f"Looks like {package_name} is already loaded. "
            "If your maco extractor overlaps an existing package name this could cause problems."
        )

    try:
        # Modify the PATH so we can recognize this new package on import
        for location in (current_directory, parent_directory):
            sys.path.insert(1, location)

        # Load the potential extractors directly from the source file
        workers = []
        for source_file in extractor_files:
            # Turn ".../<parent>/pkg/mod.py" into the dotted name "pkg.mod"
            relative_file = source_file.replace(f"{parent_directory}/", "")
            dotted_name = relative_file.replace("/", ".")[:-3]
            worker = Process(
                target=register_extractor_module,
                args=(source_file, dotted_name, venvs, extractor_module_callback, logger),
            )
            worker.start()
            workers.append(worker)

        # Wait for every registration to finish
        for worker in workers:
            worker.join()

    finally:
        # Cleanup changes made to PATH
        for location in (parent_directory, current_directory):
            sys.path.remove(location)
423
+
424
+
425
def proxy_logging(queue: Queue, callback: Callable[[ModuleType, str], None], *args, **kwargs):
    """Attach a queue-backed handler to the root logger, then invoke the callback.

    Args:
        queue (Queue): Queue that receives the log records emitted through the root logger
        callback (Callable[[ModuleType, str], None]): Function to call once logging is wired up
        *args: Positional arguments forwarded to the callback
        **kwargs: Keyword arguments forwarded to the callback; ``logger`` is supplied by this function
    """
    root_logger = logging.getLogger()

    # Forward records of every level from DEBUG upwards into the queue
    queue_handler = logging.handlers.QueueHandler(queue)
    queue_handler.setLevel(logging.DEBUG)
    root_logger.addHandler(queue_handler)

    callback(*args, **kwargs, logger=root_logger)
432
+
433
+
434
def import_extractors(
    extractor_module_callback: Callable[[ModuleType, str], bool],
    *,
    root_directory: str,
    scanner: yara.Rules,
    create_venv: bool,
    logger: Logger,
    python_version: str = f"{sys.version_info.major}.{sys.version_info.minor}.{sys.version_info.micro}",
    skip_install: bool = False,
):
    """Find, prepare and register the extractors below a directory.

    Args:
        extractor_module_callback (Callable[[ModuleType, str], bool]): Callback used to register extractors
        root_directory (str): Root directory to look for extractors
        scanner (yara.Rules): Scanner to look for extractors that match YARA rule
        create_venv (bool): Create/Use virtual environments
        logger (Logger): Logger to use
        python_version (str): Version of python to use when creating virtual environments
        skip_install (bool): Skip installation of Python dependencies for extractors
    """
    extractor_dirs, extractor_files = scan_for_extractors(root_directory, scanner, logger)

    logger.info(f"Extractor files found based on scanner ({len(extractor_files)}).")
    logger.debug(extractor_files)

    if skip_install:
        # Look for pre-existing virtual environments, if any
        logger.info("Checking for pre-existing virtual environment(s)..")
        venvs = []
        for folder, subfolders, _ in os.walk(root_directory):
            if VENV_DIRECTORY_NAME in subfolders:
                venvs.append(os.path.join(folder, VENV_DIRECTORY_NAME))
    else:
        # Install packages into the current environment or dynamically created virtual environments
        venvs = _install_required_packages(create_venv, extractor_dirs, python_version, logger)

    # With the environment prepared, we can now hunt for the extractors and register them
    logger.info("Registering extractors..")
    register_extractors(root_directory, venvs, extractor_files, extractor_module_callback, logger)
475
+
476
+
477
# Cache of instantiated extractors keyed by "<module_name>_<extractor_class>";
# only filled when extractors run in the current process (not in venv mode)
_loaded_extractors: Dict[str, Extractor] = {}
479
+
480
+
481
def run_extractor(
    sample_path,
    module_name,
    extractor_class,
    module_path,
    venv,
    venv_script=VENV_SCRIPT,
    json_decoder=Base64Decoder,
) -> Union[Dict[str, dict], model.ExtractorModel]:
    """Runs the maco extractor against sample either in current process or child process.

    Without a virtual environment the extractor is imported (and cached) in the
    current process. With one, a temporary script rendered from ``venv_script`` is
    executed by the environment's interpreter and its JSON output is loaded back.

    Args:
        sample_path (str): Path to sample
        module_name (str): Name of extractor module
        extractor_class (str): Name of extractor class in module
        module_path (str): Path to Python module containing extractor
        venv (str): Path to virtual environment associated to extractor
        venv_script (str): Script to run extractor in a virtual environment
        json_decoder (Base64Decoder): Decoder used for JSON

    Raises:
        AnalysisAbortedException: Raised when extractor voluntarily terminates execution
        Exception: Raised when extractor raises an exception

    Returns:
        Union[Dict[str, dict], model.ExtractorModel]: Results from extractor
    """
    if not venv:
        key = f"{module_name}_{extractor_class}"
        if key in _loaded_extractors:
            # retrieve cached extractor
            extractor = _loaded_extractors[key]
        else:
            # dynamic import of extractor
            # Add the correct directory to the PATH before attempting to load the extractor
            import_path = module_path[: -4 - len(module_name)]
            sys.path.insert(1, import_path)
            try:
                mod = importlib.import_module(module_name)
                extractor = getattr(mod, extractor_class)()

                # Add to cache
                _loaded_extractors[key] = extractor
            finally:
                # The import may have shifted sys.path, so only pop by position when
                # our entry is still where we put it; otherwise remove it by value.
                if len(sys.path) > 1 and sys.path[1] == import_path:
                    sys.path.pop(1)
                elif import_path in sys.path:
                    sys.path.remove(import_path)

        # Default to no matches (as the venv script does) when there is no compiled rule
        matches = []
        if extractor.yara_compiled:
            matches = extractor.yara_compiled.match(sample_path)
        # Close the sample once the extractor has finished with it
        with open(sample_path, "rb") as sample:
            loaded = extractor.run(sample, matches=matches)
    else:
        # execute extractor in child process with separate virtual environment
        # Write temporary script in the same directory as extractor to resolve relative imports
        python_exe = os.path.join(venv, "bin", "python")
        dirname = os.path.dirname(module_path)
        with tempfile.NamedTemporaryFile("w", dir=dirname, suffix=".py") as script, tempfile.NamedTemporaryFile() as output:
            # Directory that contains the top-level package of the extractor module
            parent_package_path = dirname.rsplit(module_name.split(".", 1)[0], 1)[0]
            # Module path with everything after the top-level package stripped off
            root_directory = module_path[:-3].rsplit(module_name.split(".", 1)[1].replace(".", "/"))[0]

            script.write(
                venv_script.format(
                    parent_package_path=parent_package_path,
                    module_name=module_name,
                    module_class=extractor_class,
                    sample_path=sample_path,
                    output_path=output.name,
                )
            )
            script.flush()
            cwd = root_directory
            # Dotted module name of the temporary script relative to the working directory
            custom_module = script.name[:-3].replace(root_directory, "").replace("/", ".")

            if custom_module.startswith("src."):
                # src layout found, which means the actual module content is within 'src' directory
                custom_module = custom_module[4:]
                cwd = os.path.join(cwd, "src")

            # run the maco extractor in full venv process isolation (slow)
            proc = subprocess.run(
                [python_exe, "-m", custom_module],
                cwd=cwd,
                capture_output=True,
            )
            stderr = proc.stderr.decode()
            try:
                # Load results and return them
                output.seek(0)
                loaded = json.load(output, cls=json_decoder)
            except Exception as e:
                # If there was an error raised during runtime, then propagate
                delim = f'File "{module_path}"'
                exception = stderr
                if delim in exception:
                    # Keep only the part of the traceback that starts in the extractor itself
                    exception = f"{delim}{exception.split(delim, 1)[1]}"
                if "maco.exceptions.AnalysisAbortedException" in exception:
                    # Extractor voluntarily terminated, re-raise exception to be handled by collector
                    raise AnalysisAbortedException(
                        exception.split("maco.exceptions.AnalysisAbortedException: ")[-1]
                    )
                # print extractor logging at error level
                logger.error(f"maco extractor raised exception, stderr:\n{stderr}")
                raise Exception(exception) from e
            # ensure that extractor logging is available
            logger.info(f"maco extractor stderr:\n{stderr}")
    return loaded