recongraph 0.0.1__py3-none-any.whl → 0.0.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
recongraph/recongraph.py CHANGED
@@ -1,911 +1,911 @@
1
- import csv
2
- import re
3
- import yaml
4
- from pathlib import Path
5
- from datetime import datetime
6
- from typing import List, Dict, Any
7
- from collections import defaultdict
8
- import networkx as nx
9
- from collections import defaultdict
10
- import os
11
- import argparse
12
- import pandas as pd
13
- import re
14
- import yaml
15
- from typing import List, Dict, Any
16
- from pathlib import Path
17
-
18
-
19
- class SigmaMatcher:
20
- """
21
- Handles parsing Sigma rules from YAML files and evaluating them
22
- against normalized log entries.
23
-
24
- This class processes Sigma rule detection logic, logsource requirements,
25
- and metadata. It provides an evaluation engine that determines if a
26
- specific log entry matches the rule's criteria, supporting field
27
- modifiers and complex boolean conditions.
28
- """
29
- def __init__(self, rule_file: str, flexible_mode: bool = True):
30
- with open(rule_file, 'r', encoding='utf-8') as f:
31
- self.rule_data = yaml.safe_load(f)
32
-
33
- self.title = self.rule_data.get('title', 'Unknown')
34
- self.description = self.rule_data.get('description', '')
35
- self.level = self.rule_data.get('level', 'medium')
36
- self.tags = self.rule_data.get('tags', [])
37
- self.detection = self.rule_data.get('detection', {})
38
- self.logsource = self.rule_data.get('logsource', {})
39
- self.flexible_mode = flexible_mode
40
-
41
- def match(self, log_entry: Dict[str, Any]) -> bool:
42
- """
43
- Check if log entry matches rule.
44
-
45
- This function checks if a given log entry matches the Sigma rule's detection logic.
46
- It evaluates the conditions defined in the rule against the fields in the log entry.
47
- """
48
- if not log_entry:
49
- return False
50
-
51
- if not self.flexible_mode and self.logsource:
52
- if not self._check_logsource(log_entry):
53
- return False
54
-
55
- condition = self.detection.get('condition', '').lower().strip()
56
-
57
- selections = {}
58
- for key, value in self.detection.items():
59
- if key == 'condition':
60
- continue
61
- selections[key.lower()] = self._match_selection(value, log_entry)
62
-
63
- return self._evaluate_condition(condition, selections)
64
-
65
- def _check_logsource(self, log_entry: Dict[str, Any]) -> bool:
66
- """
67
- Check if the log entry matches the rule's expected logsource.
68
-
69
- This function validates whether the log entry originates from the log source
70
- specified in the Sigma rule (category, product, service).
71
- """
72
- expected_category = self.logsource.get("category", "").lower()
73
- expected_product = self.logsource.get("product", "").lower()
74
- expected_service = self.logsource.get("service", "").lower()
75
-
76
- log_types = log_entry.get("log_type", [])
77
- if isinstance(log_types, str):
78
- log_types = [log_types]
79
-
80
- log_types_lower = [lt.lower() for lt in log_types]
81
-
82
- if expected_category and not any(expected_category in lt for lt in log_types_lower):
83
- return False
84
-
85
- if expected_product:
86
- if expected_service:
87
- if not any(expected_service in lt for lt in log_types_lower):
88
- if not any(expected_product in lt for lt in log_types_lower):
89
- return False
90
- return True
91
-
92
- def _match_selection(self, selection, log_entry: Dict) -> bool:
93
- """
94
- Match selection with log entry.
95
-
96
- This function iterates through the selection criteria (strings, lists, or dictionaries)
97
- and checks if the log entry satisfies them. In flexible mode, it searches broadly.
98
- """
99
- search_fields = self._get_search_fields(log_entry)
100
-
101
- if isinstance(selection, list):
102
- for pattern in selection:
103
- pattern_lower = str(pattern).lower()
104
- if self._match_simple_pattern(pattern_lower, search_fields):
105
- return True
106
- return False
107
-
108
- if isinstance(selection, str):
109
- pattern_lower = str(selection).lower()
110
- return self._match_simple_pattern(pattern_lower, search_fields)
111
-
112
- if not isinstance(selection, dict):
113
- return False
114
-
115
- for field, patterns in selection.items():
116
- if field == '|all':
117
- patterns = patterns if isinstance(patterns, list) else [patterns]
118
- for pattern in patterns:
119
- pattern_lower = str(pattern).lower()
120
- if not self._match_simple_pattern(pattern_lower, search_fields):
121
- return False
122
- return True
123
-
124
- if field == '|any':
125
- patterns = patterns if isinstance(patterns, list) else [patterns]
126
- for pattern in patterns:
127
- pattern_lower = str(pattern).lower()
128
- if self._match_simple_pattern(pattern_lower, search_fields):
129
- return True
130
- return False
131
-
132
- field_name, modifier = self._parse_field(field)
133
- patterns = patterns if isinstance(patterns, list) else [patterns]
134
-
135
- log_value = self._get_field_value(log_entry, field_name)
136
-
137
- null_check_needed = any(str(p).lower() == "null" for p in patterns)
138
- if not log_value and not null_check_needed:
139
- return False
140
-
141
- null_match_found = False
142
- for p in patterns:
143
- if str(p).lower() == "null":
144
- if log_value == "":
145
- null_match_found = True
146
- else:
147
- return False
148
-
149
- if null_match_found:
150
- return True
151
-
152
- pattern_matched = False
153
- for p in patterns:
154
- if str(p).lower() == "null": continue
155
- if self._match_value(log_value, str(p).lower(), modifier):
156
- pattern_matched = True
157
- break
158
-
159
- if not pattern_matched:
160
- return False
161
-
162
- return True
163
-
164
- def _match_simple_pattern(self, pattern: str, search_fields: List[str]) -> bool:
165
- """
166
- Matches a simple pattern string against a list of search fields.
167
-
168
- This function checks if the pattern exists as a substring in any of the provided search fields.
169
- """
170
- return any(pattern in field for field in search_fields)
171
-
172
- def _get_search_fields(self, log_entry: Dict) -> List[str]:
173
- """
174
- Get all searchable fields from log entry.
175
-
176
- This function gathers values from various fields in the log entry to form a list
177
- of text strings to search against. In flexible mode, it includes almost all values.
178
- """
179
- search_fields = []
180
- if 'desc' in log_entry:
181
- search_fields.append(str(log_entry.get('desc', '')).lower())
182
-
183
- if self.flexible_mode:
184
- for k, v in log_entry.items():
185
- if k not in ['log_type'] and v:
186
- search_fields.append(str(v).lower())
187
- else:
188
- http_fields = ['c-uri', 'cs-uri-query', 'cs-user-agent', 'cs-referer', 'cs-method']
189
- for field in http_fields:
190
- if field in log_entry and log_entry[field]:
191
- search_fields.append(str(log_entry[field]).lower())
192
-
193
- extra_fields = ['command', 'commandline', 'process', 'image', 'parentimage']
194
- for field in extra_fields:
195
- if field in log_entry and log_entry[field]:
196
- search_fields.append(str(log_entry[field]).lower())
197
-
198
- return search_fields if search_fields else ['']
199
-
200
- def _get_field_value(self, log_entry: Dict, field_name: str) -> str:
201
- """
202
- Get the value of a field from the log entry.
203
-
204
- This function retrieves the value of a specific field from the log entry,
205
- handling field mapping (e.g., 'uri' -> 'c-uri') and normalizing to lowercase.
206
- """
207
- if field_name in log_entry:
208
- return str(log_entry[field_name]).lower()
209
-
210
- field_mappings = {
211
- 'uri': 'c-uri',
212
- 'url': 'c-uri',
213
- 'query': 'cs-uri-query',
214
- 'useragent': 'cs-user-agent',
215
- 'user_agent': 'cs-user-agent',
216
- 'method': 'cs-method',
217
- 'status': 'sc-status',
218
- 'message': 'desc',
219
- 'msg': 'desc',
220
- 'commandline': 'desc',
221
- 'command': 'desc',
222
- }
223
-
224
- mapped_field = field_mappings.get(field_name.lower())
225
- if mapped_field and mapped_field in log_entry:
226
- return str(log_entry[mapped_field]).lower()
227
-
228
- if self.flexible_mode and 'desc' in log_entry:
229
- return str(log_entry['desc']).lower()
230
-
231
- return ''
232
-
233
- def _parse_field(self, field: str):
234
- """
235
- Parse a field string into a tuple of (field_name, modifier).
236
-
237
- This function splits a field string like 'fieldname|modifier' into its components.
238
- """
239
- if "|" not in field:
240
- return (field, None)
241
- parts = field.split("|")
242
- return parts[0], parts[-1]
243
-
244
- def _match_value(self, value: str, pattern: str, modifier: str = None):
245
- """
246
- Match a pattern against a value based on the modifier.
247
-
248
- This function applies the specified modifier (e.g., 'contains', 'startswith')
249
- to match the pattern against the value.
250
- """
251
- if modifier == "contains": return pattern in value
252
- if modifier == "startswith": return value.startswith(pattern)
253
- if modifier == "endswith": return value.endswith(pattern)
254
- if modifier == "re": return bool(re.search(pattern, value))
255
- return value == pattern
256
-
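The modifier handling above covers only a small subset of Sigma's field modifiers. A minimal, standalone sketch of the same matching semantics (an illustration, not the package's API) can be exercised like this:

import re

def match_value(value: str, pattern: str, modifier: str = None) -> bool:
    # Mirrors the modifier semantics shown above; exact match when no modifier is given.
    if modifier == "contains": return pattern in value
    if modifier == "startswith": return value.startswith(pattern)
    if modifier == "endswith": return value.endswith(pattern)
    if modifier == "re": return bool(re.search(pattern, value))
    return value == pattern

assert match_value("get /index.php?id=1 http/1.1", "php", "contains")
assert match_value("/var/www/shell.php", ".php", "endswith")
assert not match_value("get", "post")            # exact match by default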
257
- def _evaluate_condition(self, condition: str, selections: Dict[str, bool]) -> bool:
258
- """
259
- Evaluate a condition based on the selections.
260
-
261
- This function evaluates the logical condition string (e.g., 'selection1 and not selection2')
262
- using the results of the selection matching.
263
- """
264
- if not condition:
265
- return any(selections.values())
266
-
267
- condition = condition.lower().strip()
268
-
269
- def replace_x_of(match):
270
- count_str = match.group(1)
271
- prefix = match.group(2)
272
-
273
- matching_vals = [v for k, v in selections.items() if k.startswith(prefix)]
274
- if not matching_vals: return "False"
275
-
276
- if "not" in count_str:
277
- target = int(count_str.replace("not", "").strip())
278
- return str(not (sum(matching_vals) >= target))
279
- elif "all" in count_str:
280
- return str(all(matching_vals))
281
- else:
282
- target = int(count_str)
283
- return str(sum(matching_vals) >= target)
284
-
285
- condition = re.sub(r'((?:not\s+)?\d+|all)\s+of\s+(\w+)\*?', replace_x_of, condition)
286
-
287
- if "all of them" in condition: condition = condition.replace("all of them", str(all(selections.values())))
288
- if "1 of them" in condition: condition = condition.replace("1 of them", str(any(selections.values())))
289
- if "any of them" in condition: condition = condition.replace("any of them", str(any(selections.values())))
290
-
291
- for key, result in selections.items():
292
- condition = re.sub(rf"\b{re.escape(key)}\b", str(result), condition)
293
-
294
- try:
295
- return bool(eval(condition))
296
- except Exception:
297
- return any(selections.values())
298
-
299
-
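A hedged usage sketch for the matcher above, assuming the module is importable as recongraph.recongraph (the import path is an assumption); the rule content and log entry are invented for illustration:

import tempfile, textwrap
from recongraph.recongraph import SigmaMatcher  # assumed import path

rule_yaml = textwrap.dedent("""\
    title: Suspicious wget download
    level: high
    detection:
      selection:
        - 'wget http'
      condition: selection
""")
with tempfile.NamedTemporaryFile("w", suffix=".yml", delete=False) as f:
    f.write(rule_yaml)
    rule_path = f.name

matcher = SigmaMatcher(rule_path, flexible_mode=True)
print(matcher.match({"desc": "user ran wget http://203.0.113.5/payload.sh",
                     "log_type": ["linux"]}))   # expected: True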
300
- class SigmaRulesLoader:
301
- """
302
- Manages the lifecycle of Sigma rules within a specified directory.
303
-
304
- This class handles searching for, loading, and initializing Sigma rules into
305
- executable matchers. It provides a high-level interface for checking log
306
- entries against the entire rule set and managing rule-specific metadata.
307
- """
308
- def __init__(self, rules_dir: str, flexible_mode: bool = True):
309
- self.rules_dir = rules_dir
310
- self.flexible_mode = flexible_mode
311
- self.matchers = []
312
- self._load_rules()
313
-
314
- def _load_rules(self):
315
- """
316
- Loads all rules from the specified directory.
317
-
318
- This function scans the specified rules directory for YAML files,
319
- creates a SigmaMatcher for each, and stores them in the matchers list.
320
- """
321
- if not self.rules_dir:
322
- print("No rules directory specified. Skipping rule loading.")
323
- return
324
-
325
- rules_path = Path(self.rules_dir)
326
- if not rules_path.exists():
327
- print(f"Rules directory {rules_path} does not exist")
328
- return
329
-
330
- mode_str = "FLEXIBLE" if self.flexible_mode else "STRICT"
331
- print(f"- Loading Sigma Rules from: {self.rules_dir} (Mode: {mode_str})")
332
-
333
- loaded_count = 0
334
- for rule_file in rules_path.glob('**/*.yml'):
335
- try:
336
- matcher = SigmaMatcher(str(rule_file), flexible_mode=self.flexible_mode)
337
- self.matchers.append({
338
- 'matcher': matcher,
339
- 'title': matcher.title,
340
- 'level': matcher.level,
341
- })
342
- loaded_count += 1
343
- except Exception:
344
- pass
345
-
346
- print(f"- Total rules loaded: {loaded_count} rules")
347
-
348
- def check_row(self, parsed_row: Dict[str, Any]) -> List[Dict[str, str]]:
349
- """
350
- Check if a row matches any of the loaded rules.
351
-
352
- This function iterates through all loaded rules and checks if the given
353
- parsed log row matches any of them. Returns a list of matching rules.
354
- """
355
- matches = []
356
- for rule_info in self.matchers:
357
- matcher = rule_info['matcher']
358
- if matcher.match(parsed_row):
359
- matches.append({
360
- 'rule_title': matcher.title,
361
- 'rule_level': matcher.level,
362
- })
363
- return matches
364
-
365
- def extract_sigma_priority(self, sigma_value: str) -> str:
366
- """
367
- Select top priority rule based on severity.
368
-
369
- This function parses a string of matched Sigma rules (formatted as
370
- 'Title[Severity] | Title[Severity]') and determines the highest priority
371
- match based on severity level.
372
- """
373
- if not sigma_value or not sigma_value.strip():
374
- return ""
375
-
376
- items = [s.strip() for s in sigma_value.split("|")]
377
- priority = {"critical": 5, "high": 4, "medium": 3, "low": 2, "informational": 1}
378
-
379
- best_item = None
380
- best_score = 0
381
-
382
- for item in items:
383
- if "[" in item and "]" in item:
384
- severity = item[item.rfind("[")+1 : item.rfind("]")].lower().strip()
385
- score = priority.get(severity, 0)
386
- if score > best_score:
387
- best_score = score
388
- best_item = item
389
-
390
- return best_item or ""
391
-
392
-
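A brief sketch of the loader interface above; rules/ is a hypothetical directory and the rule titles in the comment are invented. The severity-priority helper can be tried on its own:

from recongraph.recongraph import SigmaRulesLoader  # assumed import path

loader = SigmaRulesLoader("rules/", flexible_mode=True)       # hypothetical rule dir
matches = loader.check_row({"desc": "GET /etc/passwd HTTP/1.1 code: 200",
                            "log_type": ["webserver"]})
detection_str = " | ".join(f"{m['rule_title']}[{m['rule_level']}]" for m in matches)

# Highest severity wins when several rules fire:
print(loader.extract_sigma_priority("Recon Scan[low] | Web Shell Upload[high]"))
# -> 'Web Shell Upload[high]'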
393
- class SigmaLabel(object):
394
- """
395
- Orchestrates the log labeling process using Sigma rules.
396
-
397
- This class is responsible for reading input log files (CSV or TXT),
398
- identifying the appropriate log type and source, and applying the loaded
399
- Sigma rules to each entry to generate a labeled dataset.
400
- """
401
- def __init__(self, input_file, rules_dir=None, flexible_mode=True):
402
- self.input_file = input_file
403
- self.rules_dir = rules_dir
404
- self.flexible_mode = flexible_mode
405
-
406
- def count_lines(self):
407
- """
408
- Counts the number of lines in the input file.
409
-
410
- This function reads the input file to count the total number of lines,
411
- which is useful for progress tracking.
412
- """
413
- cnt = 0
414
- try:
415
- with open(self.input_file, 'r', encoding='utf-8', errors='replace') as f:
416
- for _ in f: cnt += 1
417
- except:
418
- pass
419
- return cnt
420
-
421
- def detect_log_type(self, desc: str, filename: str) -> Dict[str, Any]:
422
- """
423
- Detects the type of log entry based on its description and filename.
424
-
425
- This function analyzes the log description and filename to categorize the log
426
- (e.g., 'webserver', 'linux', 'windows') and extracts relevant fields like
427
- HTTP methods or status codes.
428
- """
429
- parsed = {}
430
- log_types = []
431
- lower_desc = desc.lower()
432
-
433
- if 'access.log' in filename:
434
- log_types.extend(['webserver', 'proxy', 'nginx', 'apache'])
435
- self._extract_http_fields(desc, parsed)
436
-
437
- if 'auth.log' in filename:
438
- log_types.extend(['linux', 'sshd'])
439
- if 'pam' in lower_desc: log_types.append('pam')
440
- if 'syslog' in filename:
441
- log_types.extend(['syslog', 'linux'])
442
- if 'systemd' in lower_desc: log_types.append('systemd')
443
- if 'kernel' in lower_desc: log_types.append('kernel')
444
- if 'audit' in lower_desc: log_types.append('auditd')
445
-
446
- if 'windows' in filename.lower() or '.evtx' in filename.lower():
447
- log_types.append('windows')
448
- if 'sysmon' in filename.lower(): log_types.append('sysmon')
449
- if 'security' in filename.lower(): log_types.append('security')
450
- if 'system' in filename.lower(): log_types.append('system')
451
-
452
- if self._looks_like_http_log(desc):
453
- if 'webserver' not in log_types: log_types.extend(['webserver', 'generic_http'])
454
- if 'cs-method' not in parsed: self._extract_http_fields(desc, parsed)
455
-
456
- if not log_types:
457
- log_types.append('unknown')
458
-
459
- parsed['log_type'] = log_types
460
- return parsed
461
-
462
- def _looks_like_http_log(self, desc: str) -> bool:
463
- """
464
- Detects if a log entry looks like an HTTP log entry.
465
-
466
- This function uses regular expressions to check for common HTTP log patterns,
467
- such as HTTP methods, status codes, or user-agent strings.
468
- """
469
- http_indicators = [
470
- r'\b(GET|POST|PUT|DELETE|HEAD|OPTIONS|PATCH)\b',
471
- r'HTTP/\d\.\d',
472
- r'\b(200|301|302|400|401|403|404|500)\b',
473
- r'user[_-]?agent',
474
- r'referer',
475
- ]
476
- for pattern in http_indicators:
477
- if re.search(pattern, desc, re.IGNORECASE):
478
- return True
479
- return False
480
-
481
- def _extract_http_fields(self, desc: str, parsed: Dict[str, Any]):
482
- """
483
- Extracts HTTP fields from a log entry description.
484
-
485
- This function parses the log description to extract HTTP Method, URI,
486
- Status Code, and User Agent, populating the 'parsed' dictionary.
487
- """
488
- method_match = re.search(r'\b(GET|POST|PUT|DELETE|HEAD|OPTIONS|PATCH)\b', desc)
489
- if method_match: parsed['cs-method'] = method_match.group(1)
490
-
491
- uri_match = re.search(r'(GET|POST|PUT|DELETE|HEAD|OPTIONS|PATCH)\s+([^\s]+)\s+HTTP', desc)
492
- if uri_match:
493
- parsed['c-uri'] = uri_match.group(2)
494
- else:
495
- uri_match = re.search(r'(GET|POST|PUT|DELETE|HEAD|OPTIONS|PATCH)\s+([^\s\"]+)', desc)
496
- if uri_match: parsed['c-uri'] = uri_match.group(2)
497
-
498
- status_match = re.search(r'code:\s*(\d{3})', desc)
499
- if status_match: parsed['sc-status'] = status_match.group(1)
500
-
501
- ua_match = re.search(r'user_agent:\s*(.+?)(?:\s+\w+:|$)', desc)
502
- if ua_match: parsed['cs-user-agent'] = ua_match.group(1).strip()
503
-
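To illustrate the field extraction above, here is a small, hedged example (the log line is fabricated; detect_log_type does not read the file, so the filename only steers classification):

from recongraph.recongraph import SigmaLabel  # assumed import path

labeler = SigmaLabel("access.log")
line = "GET /wp-login.php HTTP/1.1 code: 404 user_agent: sqlmap/1.7"
print(labeler.detect_log_type(line, "access.log"))
# Roughly: {'cs-method': 'GET', 'c-uri': '/wp-login.php', 'sc-status': '404',
#           'cs-user-agent': 'sqlmap/1.7', 'log_type': ['webserver', 'proxy', 'nginx', 'apache']}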
504
- def run(self):
505
- """
506
- Processes the input file and returns a labeled DataFrame.
507
-
508
- This function orchestrates the loading of data, detection of log types,
509
- matching against Sigma rules, and generation of a labeled DataFrame.
510
- """
511
- rules_loader = SigmaRulesLoader(self.rules_dir, flexible_mode=self.flexible_mode)
512
-
513
- if not rules_loader.matchers:
514
- print("No rules loaded! Continuing without matching...")
515
-
516
- is_csv = self.input_file.endswith('.csv')
517
- df = pd.DataFrame()
518
-
519
- if is_csv:
520
- try:
521
- df = pd.read_csv(self.input_file, dtype=str)
522
- except:
523
- df = pd.read_csv(self.input_file, header=None, dtype=str)
524
- df.columns = [f'col_{i}' for i in range(len(df.columns))]
525
- else:
526
- try:
527
- with open(self.input_file, 'r', encoding='utf-8', errors='replace') as f:
528
- lines = f.readlines()
529
- df = pd.DataFrame({'description': lines})
530
- df['filename'] = self.input_file
531
- except Exception as e:
532
- print(f"Error reading file: {e}")
533
- return df
534
-
535
- processed_rows = []
536
- total_rows = len(df)
537
- print(f"- Labeling {total_rows} rows...")
538
-
539
- count = 0
540
- for _, row in df.iterrows():
541
- count += 1
542
- if count % 1000 == 0:
543
- print(f"- Processed {count}/{total_rows} lines...")
544
-
545
- desc = ""
546
- if 'message' in row: desc = str(row['message'])
547
- elif 'desc' in row: desc = str(row['desc'])
548
- elif 'description' in row: desc = str(row['description'])
549
- elif len(row) > 4 and isinstance(row.values[4], str): desc = row.values[4]
550
- else: desc = str(row.values[0])
551
-
552
- fname = self.input_file
553
- if 'filename' in row: fname = str(row['filename'])
554
- elif 'source_short' in row: fname = str(row['source_short'])
555
- elif 'display_name' in row: fname = str(row['display_name'])
556
- elif 'source' in row: fname = str(row['source'])
557
- elif len(row) > 6 and isinstance(row.values[6], str): fname = row.values[6]
558
-
559
- features = self.detect_log_type(str(desc), str(fname))
560
-
561
- log_entry = {
562
- "desc": desc,
563
- "log_type": features["log_type"],
564
- "cs-method": features.get("cs-method", ""),
565
- "c-uri": features.get("c-uri", ""),
566
- "sc-status": features.get("sc-status", ""),
567
- "cs-user-agent": features.get("cs-user-agent", ""),
568
- "service": features.get("service", ""),
569
- }
570
-
571
- matches = rules_loader.check_row(log_entry)
572
-
573
- if matches:
574
- detection_str = " | ".join([f"{m['rule_title']}[{m['rule_level']}]" for m in matches])
575
- else:
576
- detection_str = ""
577
-
578
- new_row = row.to_dict()
579
- new_row['logsource'] = str(features['log_type'])
580
- new_row['sigma'] = rules_loader.extract_sigma_priority(detection_str)
581
- processed_rows.append(new_row)
582
-
583
- return pd.DataFrame(processed_rows)
584
-
585
-
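A short, hedged sketch of running the labeler end to end; timeline.csv and rules/ are placeholder paths:

from recongraph.recongraph import SigmaLabel  # assumed import path

labeler = SigmaLabel("timeline.csv", rules_dir="rules/", flexible_mode=True)
df_labeled = labeler.run()
# run() returns the input rows plus two appended columns:
print(df_labeled[["logsource", "sigma"]].head())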
586
- class EdgeGraph(object):
587
- """
588
- Constructs a directed graph from sigma-labeled logs to visualize system behavior.
589
-
590
- This class transforms a sequential list of security events into a
591
- MultiDiGraph where nodes represent unique event types and edges represent
592
- temporal transitions between them. It captures event frequency and
593
- associated log metadata to facilitate forensic analysis.
594
- """
595
- def __init__(self, df: pd.DataFrame):
596
- self.df = df.copy()
597
-
598
- if 'message' not in self.df.columns:
599
- if 'desc' in self.df.columns:
600
- self.df.rename(columns={'desc': 'message'}, inplace=True)
601
- elif 'description' in self.df.columns:
602
- self.df.rename(columns={'description': 'message'}, inplace=True)
603
- else:
604
- self.df['message'] = ""
605
-
606
- if 'datetime' not in self.df.columns:
607
- if 'timestamp' in self.df.columns:
608
- self.df.rename(columns={'timestamp': 'datetime'}, inplace=True)
609
- else:
610
- self.df['datetime'] = ""
611
-
612
- self.events_dict = {}
613
- self.node_labels = {}
614
- self.node_events = []
615
- self.G = nx.MultiDiGraph()
616
-
617
- self.log_event_id = []
618
- self.node_members = defaultdict(list)
619
- self.event_logs = defaultdict(list)
620
- self.event_timestamps = defaultdict(list)
621
-
622
- self.edges_list = []
623
- self.edges_weight = defaultdict(int)
624
- self.edges_weight_list = []
625
-
626
- def define_events(self):
627
- """
628
- Identify unique security events from the labeled dataset.
629
-
630
- This function iterates through the 'sigma' column to find all unique rule matches.
631
- These matches define the nodes of the graph. Each unique Sigma label
632
- becomes a distinct node in the resulting behavioral map.
633
- """
634
- lines = self.df['message'].tolist()
635
-
636
- if 'sigma' in self.df.columns:
637
- events = self.df[self.df['sigma'].notna() & (self.df['sigma'] != '')]['sigma'].unique().tolist()
638
- else:
639
- events = []
640
-
641
- self.events_dict = {}
642
- for index, event in enumerate(events):
643
- self.events_dict[event] = index
644
- self.node_labels = {}
645
- for index, event in enumerate(events):
646
- self.node_labels[index] = event
647
-
648
- self.node_events = []
649
- for index, event in enumerate(events):
650
- self.node_events.append((index, {'event': f"{str(index)}. {event}"}))
651
-
652
- def create_graph(self):
653
- """
654
- Initialize the graph with nodes.
655
-
656
- This function creates a new networkx MultiDiGraph and adds the identified events
657
- as nodes.
658
- """
659
- self.G = nx.MultiDiGraph()
660
- self.G.add_nodes_from(self.node_events)
661
- print(f"Graph nodes added: {self.G.number_of_nodes()}")
662
-
663
- def get_list_event_id(self):
664
- """
665
- Map log entries to event IDs.
666
-
667
- This function processes the DataFrame rows, identifying which event ID corresponds
668
- to each log entry based on its Sigma label, and stores this mapping.
669
- """
670
- self.log_event_id = []
671
- self.node_members = defaultdict(list)
672
- self.event_logs = defaultdict(list)
673
- self.event_timestamps = defaultdict(list)
674
-
675
- for line_id, row in self.df.iterrows():
676
- sigma_value = row.get('sigma')
677
- desc_value = row.get('message')
678
- timestamp_value = row.get('datetime')
679
-
680
- if pd.notna(sigma_value) and sigma_value != '':
681
- if sigma_value in self.events_dict:
682
- event_id = self.events_dict[sigma_value]
683
- self.log_event_id.append(event_id)
684
- self.node_members[event_id].append(line_id)
685
- self.event_logs[event_id].append(desc_value)
686
- self.event_timestamps[event_id].append(timestamp_value)
687
-
688
- def add_node_attributes(self):
689
- """
690
- Enrich nodes with attributes.
691
-
692
- This function adds metadata to each node in the graph, such as the first log snippet,
693
- timestamp, and the count of logs associated with that event.
694
- """
695
- for event_id in self.event_logs.keys():
696
- logs = self.event_logs[event_id]
697
- timestamps = self.event_timestamps[event_id]
698
-
699
-
700
- if logs:
701
- first_log = logs[0]
702
- else:
703
- first_log = ""
704
-
705
- if timestamps:
706
- first_timestamp = timestamps[0]
707
- else:
708
- first_timestamp = ""
709
-
710
-
711
- if self.G.has_node(event_id):
712
- self.G.nodes[event_id]['message'] = first_log
713
- self.G.nodes[event_id]['timestamp'] = first_timestamp
714
- self.G.nodes[event_id]['log_count'] = len(logs)
715
-
716
- def create_edges(self):
717
- """
718
- Calculate edges based on event transitions.
719
-
720
- This function iterates through the sequence of event IDs and creates edges
721
- between consecutive events, counting their occurrences to determine weights.
722
- """
723
- self.edges_list = []
724
- self.edges_weight = defaultdict(int)
725
- log_event_id_len = len(self.log_event_id)
726
-
727
- for index, event_id in enumerate(self.log_event_id):
728
- if (index + 1) < log_event_id_len:
729
- self.edges_list.append((event_id, self.log_event_id[index + 1]))
730
- self.edges_weight[(event_id, self.log_event_id[index + 1])] += 1
731
-
732
- def create_weighted_edges(self):
733
- """
734
- Format edges with weights for the graph.
735
-
736
- This function prepares the list of weighted edges to be added to the networkx graph.
737
- """
738
- self.edges_weight_list = []
739
- for edge, weight in self.edges_weight.items():
740
- self.edges_weight_list.append((edge[0], edge[1], {'weight': weight}))
741
-
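As a concrete illustration of the transition counting performed by create_edges and create_weighted_edges (the event-ID sequence is invented):

from collections import defaultdict

log_event_id = [0, 1, 1, 0, 1]                  # hypothetical per-log event IDs
edges_weight = defaultdict(int)
for i in range(len(log_event_id) - 1):
    edges_weight[(log_event_id[i], log_event_id[i + 1])] += 1
print(dict(edges_weight))
# {(0, 1): 2, (1, 1): 1, (1, 0): 1} -> becomes weighted MultiDiGraph edges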
742
- def add_edges_to_graph(self):
743
- """
744
- Add weighted edges to the graph.
745
-
746
- This function incorporates the calculated weighted edges into the graph structure.
747
- """
748
- self.G.add_edges_from(self.edges_weight_list)
749
-
750
- def write_to_graphml(self, output_filename="reconstruction_edge_graph.graphml"):
751
- """
752
- Save the graph to a GraphML file.
753
-
754
- This function exports the constructed graph to a file in GraphML format.
755
- """
756
- filename_graph_output = output_filename
757
- nx.write_graphml_lxml(self.G, filename_graph_output)
758
- print(f"[!] Graph saved to {filename_graph_output}")
759
- print(f"[!] Graph contains {self.G.number_of_nodes()} nodes and {self.G.number_of_edges()} edges.")
760
-
761
- def export_event_logs(self, output_filename="reconstruction_event_logs.csv"):
762
- """
763
- Exports detailed event logs to a separate CSV file.
764
-
765
- This function creates a detailed CSV report containing every log entry that
766
- contributed to the identified events.
767
- """
768
- csv_export_data = []
769
- for event_id in self.event_logs.keys():
770
- logs = self.event_logs[event_id]
771
- timestamps = self.event_timestamps[event_id]
772
-
773
- for ts, log in zip(timestamps, logs):
774
- csv_export_data.append({
775
- 'event_id': event_id,
776
- 'event_name': self.node_labels[event_id],
777
- 'timestamp': ts,
778
- 'log': log
779
- })
780
-
781
- if csv_export_data:
782
- csv_export_df = pd.DataFrame(csv_export_data)
783
- csv_filename = output_filename
784
- csv_export_df.to_csv(csv_filename, index=False)
785
- print(f"[+] Event logs also saved to: {csv_filename}")
786
- else:
787
- print("[!] No event logs to export.")
788
-
789
- def run_all(self, graph_output="reconstruction_edge_graph.graphml", csv_output=None):
790
- """
791
- Execute the full graph construction pipeline.
792
-
793
- This function will run the full graph construction pipeline which consists of 6 phases:
794
- 1. Defining Events
795
- 2. Creating Graph Nodes
796
- 3. Processing Log Events
797
- 4. Adding Node Attributes
798
- 5. Creating Edges
799
- 6. Writing Output
800
- """
801
- if self.df.empty:
802
- print("[!] DataFrame is empty. Cannot build graph.")
803
- return
804
-
805
- print("[+] Defining Events")
806
- self.define_events()
807
-
808
- if not self.events_dict:
809
- print("[!] No Sigma events found. Graph will be empty.")
810
- return
811
-
812
- print("[+] Creating Graph Nodes")
813
- self.create_graph()
814
-
815
- print("[+] Processing Log Events")
816
- self.get_list_event_id()
817
-
818
- print("[+] Adding Node Attributes")
819
- self.add_node_attributes()
820
-
821
- print("[+] Creating Edges")
822
- self.create_edges()
823
- self.create_weighted_edges()
824
- self.add_edges_to_graph()
825
-
826
- print("[+] Writing Output")
827
- self.write_to_graphml(graph_output)
828
-
829
- if csv_output:
830
- print("[+] Exporting Event Logs")
831
- self.export_event_logs(csv_output)
832
-
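A minimal, hedged example of building a graph from an already-labeled frame (three fabricated rows; writing GraphML via networkx's write_graphml_lxml requires lxml to be installed):

import pandas as pd
from recongraph.recongraph import EdgeGraph  # assumed import path

df = pd.DataFrame({
    "message":  ["sshd: failed password", "sshd: failed password", "sudo: session opened for root"],
    "datetime": ["2024-01-01T00:00:00", "2024-01-01T00:00:05", "2024-01-01T00:01:00"],
    "sigma":    ["SSH Brute Force[medium]", "SSH Brute Force[medium]", "Privilege Escalation[high]"],
})
EdgeGraph(df).run_all(graph_output="demo_graph.graphml", csv_output="demo_events.csv")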
833
- class ReconGraph(object):
834
- """
835
- Unified facade for the complete forensic reconstruction pipeline.
836
-
837
- This class serves as the main entry point for the ReconGraph library,
838
- coordinating the transition from raw logs to labeled data and finally
839
- to a behavioral graph. It simplifies complex operations into a
840
- single automated workflow.
841
- """
842
- def __init__(self, input_file, rules_dir=None, flexible_mode=True):
843
- self.input_file = input_file
844
- self.rules_dir = rules_dir
845
- self.flexible_mode = flexible_mode
846
-
847
- def run_all(self, graph_output="reconstruction_edge_graph.graphml",
848
- csv_output=None, sigma_output=None):
849
- """
850
- Executes the full pipeline.
851
-
852
- This function will run the full execution pipeline which consists of 3 phases:
853
- 1. Sigma Labeling
854
- 2. Edge Graph Construction
855
- 3. Export
856
- """
857
- print(f"[+] Starting ReconGraph Pipeline for {self.input_file}")
858
-
859
- print("[Phase 1] Sigma Labeling")
860
- labeler = SigmaLabel(self.input_file, self.rules_dir, flexible_mode=self.flexible_mode)
861
- df_labeled = labeler.run()
862
-
863
- if sigma_output:
864
- if sigma_output == 'AUTO':
865
- base_name = os.path.splitext(os.path.basename(self.input_file))[0]
866
- final_sigma_output = f"{base_name}_sigma_labeled.csv"
867
- else:
868
- final_sigma_output = sigma_output
869
-
870
- df_labeled.to_csv(final_sigma_output, index=False)
871
- print(f"Sigma-labeled data exported to: {final_sigma_output}")
872
-
873
- print("\n[Phase 2] Edge Graph Construction")
874
- reconstruction = EdgeGraph(df_labeled)
875
- reconstruction.run_all(graph_output=graph_output, csv_output=csv_output)
876
- print("\n[✓] Pipeline Completed Successfully")
877
-
878
-
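The facade can also be driven directly from Python; a hedged sketch with placeholder paths:

from recongraph.recongraph import ReconGraph  # assumed import path

pipeline = ReconGraph("timeline.csv", rules_dir="rules/", flexible_mode=True)
pipeline.run_all(
    graph_output="reconstruction_edge_graph.graphml",
    csv_output="reconstruction_event_logs.csv",   # optional per-event log export
    sigma_output="AUTO",                          # derive the labeled-CSV name from the input
)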
879
- def main():
880
- """
881
- Main execution entry point.
882
- Uses the ReconGraph facade to run the full pipeline.
883
- """
884
- parser = argparse.ArgumentParser(description='Reconstruct a graph from forensic timeline.')
885
- parser.add_argument('-f', '--file', required=True, help='Path to the input file (CSV or TXT)')
886
- parser.add_argument('-o', '--output', help='Output filename for the GraphML file', default='reconstruction_edge_graph.graphml')
887
- parser.add_argument('-r', '--rules', help='Path to the rules directory', default=None)
888
- parser.add_argument('--export-csv', nargs='?', const='reconstruction_event_logs.csv', default=None, help='Export detailed event logs to a separate CSV file')
889
- parser.add_argument('--export-sigma', nargs='?', const='AUTO', default=None, help='Export the sigma-labeled DataFrame to a CSV file')
890
- parser.add_argument('--strict', action='store_true', help='Disable flexible matching mode (strict validation)')
891
-
892
- args = parser.parse_args()
893
-
894
- if os.path.exists(args.file):
895
- pipeline = ReconGraph(
896
- input_file=args.file,
897
- rules_dir=getattr(args, 'rules', None),
898
- flexible_mode=not args.strict
899
- )
900
-
901
- pipeline.run_all(
902
- graph_output=args.output,
903
- csv_output=args.export_csv,
904
- sigma_output=args.export_sigma
905
- )
906
-
907
- else:
908
- print(f"[!] File {args.file} not found. Please ensure the input file is present.")
909
-
910
- if __name__ == '__main__':
911
- main()
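For reference, a typical command-line invocation of the entry point above might look like the following (paths are placeholders; running the module with python -m is an assumption, since no console-script name is shown here):

python -m recongraph.recongraph -f timeline.csv -r ./sigma_rules -o attack_graph.graphml --export-csv --export-sigma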
1
+ import csv
2
+ import re
3
+ import yaml
4
+ from pathlib import Path
5
+ from datetime import datetime
6
+ from typing import List, Dict, Any
7
+ from collections import defaultdict
8
+ import networkx as nx
9
+ from collections import defaultdict
10
+ import os
11
+ import argparse
12
+ import pandas as pd
13
+ import re
14
+ import yaml
15
+ from typing import List, Dict, Any
16
+ from pathlib import Path
17
+
18
+
19
+ class SigmaMatcher:
20
+ """
21
+ Handles parsing Sigma rules from YAML files and evaluating them
22
+ against normalized log entries.
23
+
24
+ This class processes Sigma rule detection logic, logsource requirements,
25
+ and metadata. It provides an evaluation engine that determines if a
26
+ specific log entry matches the rule's criteria, supporting field
27
+ modifiers and complex boolean conditions.
28
+ """
29
+ def __init__(self, rule_file: str, flexible_mode: bool = True):
30
+ with open(rule_file, 'r', encoding='utf-8') as f:
31
+ self.rule_data = yaml.safe_load(f)
32
+
33
+ self.title = self.rule_data.get('title', 'Unknown')
34
+ self.description = self.rule_data.get('description', '')
35
+ self.level = self.rule_data.get('level', 'medium')
36
+ self.tags = self.rule_data.get('tags', [])
37
+ self.detection = self.rule_data.get('detection', {})
38
+ self.logsource = self.rule_data.get('logsource', {})
39
+ self.flexible_mode = flexible_mode
40
+
41
+ def match(self, log_entry: Dict[str, Any]) -> bool:
42
+ """
43
+ Check if log entry matches rule.
44
+
45
+ This function checks if a given log entry matches the Sigma rule's detection logic.
46
+ It evaluates the conditions defined in the rule against the fields in the log entry.
47
+ """
48
+ if not log_entry:
49
+ return False
50
+
51
+ if not self.flexible_mode and self.logsource:
52
+ if not self._check_logsource(log_entry):
53
+ return False
54
+
55
+ condition = self.detection.get('condition', '').lower().strip()
56
+
57
+ selections = {}
58
+ for key, value in self.detection.items():
59
+ if key == 'condition':
60
+ continue
61
+ selections[key.lower()] = self._match_selection(value, log_entry)
62
+
63
+ return self._evaluate_condition(condition, selections)
64
+
65
+ def _check_logsource(self, log_entry: Dict[str, Any]) -> bool:
66
+ """
67
+ Check if the log entry matches the rule's expected logsource.
68
+
69
+ This function validates whether the log entry originates from the log source
70
+ specified in the Sigma rule (category, product, service).
71
+ """
72
+ expected_category = self.logsource.get("category", "").lower()
73
+ expected_product = self.logsource.get("product", "").lower()
74
+ expected_service = self.logsource.get("service", "").lower()
75
+
76
+ log_types = log_entry.get("log_type", [])
77
+ if isinstance(log_types, str):
78
+ log_types = [log_types]
79
+
80
+ log_types_lower = [lt.lower() for lt in log_types]
81
+
82
+ if expected_category and not any(expected_category in lt for lt in log_types_lower):
83
+ return False
84
+
85
+ if expected_product:
86
+ if expected_service:
87
+ if not any(expected_service in lt for lt in log_types_lower):
88
+ if not any(expected_product in lt for lt in log_types_lower):
89
+ return False
90
+ return True
91
+
92
+ def _match_selection(self, selection, log_entry: Dict) -> bool:
93
+ """
94
+ Match selection with log entry.
95
+
96
+ This function iterates through the selection criteria (strings, lists, or dictionaries)
97
+ and checks if the log entry satisfies them. In flexible mode, it searches broadly.
98
+ """
99
+ search_fields = self._get_search_fields(log_entry)
100
+
101
+ if isinstance(selection, list):
102
+ for pattern in selection:
103
+ pattern_lower = str(pattern).lower()
104
+ if self._match_simple_pattern(pattern_lower, search_fields):
105
+ return True
106
+ return False
107
+
108
+ if isinstance(selection, str):
109
+ pattern_lower = str(selection).lower()
110
+ return self._match_simple_pattern(pattern_lower, search_fields)
111
+
112
+ if not isinstance(selection, dict):
113
+ return False
114
+
115
+ for field, patterns in selection.items():
116
+ if field == '|all':
117
+ patterns = patterns if isinstance(patterns, list) else [patterns]
118
+ for pattern in patterns:
119
+ pattern_lower = str(pattern).lower()
120
+ if not self._match_simple_pattern(pattern_lower, search_fields):
121
+ return False
122
+ return True
123
+
124
+ if field == '|any':
125
+ patterns = patterns if isinstance(patterns, list) else [patterns]
126
+ for pattern in patterns:
127
+ pattern_lower = str(pattern).lower()
128
+ if self._match_simple_pattern(pattern_lower, search_fields):
129
+ return True
130
+ return False
131
+
132
+ field_name, modifier = self._parse_field(field)
133
+ patterns = patterns if isinstance(patterns, list) else [patterns]
134
+
135
+ log_value = self._get_field_value(log_entry, field_name)
136
+
137
+ null_check_needed = any(str(p).lower() == "null" for p in patterns)
138
+ if not log_value and not null_check_needed:
139
+ return False
140
+
141
+ null_match_found = False
142
+ for p in patterns:
143
+ if str(p).lower() == "null":
144
+ if log_value == "":
145
+ null_match_found = True
146
+ else:
147
+ return False
148
+
149
+ if null_match_found:
150
+ return True
151
+
152
+ pattern_matched = False
153
+ for p in patterns:
154
+ if str(p).lower() == "null": continue
155
+ if self._match_value(log_value, str(p).lower(), modifier):
156
+ pattern_matched = True
157
+ break
158
+
159
+ if not pattern_matched:
160
+ return False
161
+
162
+ return True
163
+
164
+ def _match_simple_pattern(self, pattern: str, search_fields: List[str]) -> bool:
165
+ """
166
+ Matches a simple pattern string against a list of search fields.
167
+
168
+ This function checks if the pattern exists as a substring in any of the provided search fields.
169
+ """
170
+ return any(pattern in field for field in search_fields)
171
+
172
+ def _get_search_fields(self, log_entry: Dict) -> List[str]:
173
+ """
174
+ Get all searchable fields from log entry.
175
+
176
+ This function gathers values from various fields in the log entry to form a list
177
+ of text strings to search against. In flexible mode, it includes almost all values.
178
+ """
179
+ search_fields = []
180
+ if 'desc' in log_entry:
181
+ search_fields.append(str(log_entry.get('desc', '')).lower())
182
+
183
+ if self.flexible_mode:
184
+ for k, v in log_entry.items():
185
+ if k not in ['log_type'] and v:
186
+ search_fields.append(str(v).lower())
187
+ else:
188
+ http_fields = ['c-uri', 'cs-uri-query', 'cs-user-agent', 'cs-referer', 'cs-method']
189
+ for field in http_fields:
190
+ if field in log_entry and log_entry[field]:
191
+ search_fields.append(str(log_entry[field]).lower())
192
+
193
+ extra_fields = ['command', 'commandline', 'process', 'image', 'parentimage']
194
+ for field in extra_fields:
195
+ if field in log_entry and log_entry[field]:
196
+ search_fields.append(str(log_entry[field]).lower())
197
+
198
+ return search_fields if search_fields else ['']
199
+
200
+ def _get_field_value(self, log_entry: Dict, field_name: str) -> str:
201
+ """
202
+ Get the value of a field from the log entry.
203
+
204
+ This function retrieves the value of a specific field from the log entry,
205
+ handling field mapping (e.g., 'uri' -> 'c-uri') and normalizing to lowercase.
206
+ """
207
+ if field_name in log_entry:
208
+ return str(log_entry[field_name]).lower()
209
+
210
+ field_mappings = {
211
+ 'uri': 'c-uri',
212
+ 'url': 'c-uri',
213
+ 'query': 'cs-uri-query',
214
+ 'useragent': 'cs-user-agent',
215
+ 'user_agent': 'cs-user-agent',
216
+ 'method': 'cs-method',
217
+ 'status': 'sc-status',
218
+ 'message': 'desc',
219
+ 'msg': 'desc',
220
+ 'commandline': 'desc',
221
+ 'command': 'desc',
222
+ }
223
+
224
+ mapped_field = field_mappings.get(field_name.lower())
225
+ if mapped_field and mapped_field in log_entry:
226
+ return str(log_entry[mapped_field]).lower()
227
+
228
+ if self.flexible_mode and 'desc' in log_entry:
229
+ return str(log_entry['desc']).lower()
230
+
231
+ return ''
232
+
233
+ def _parse_field(self, field: str):
234
+ """
235
+ Parse a field string into a tuple of (field_name, modifier).
236
+
237
+ This function splits a field string like 'fieldname|modifier' into its components.
238
+ """
239
+ if "|" not in field:
240
+ return (field, None)
241
+ parts = field.split("|")
242
+ return parts[0], parts[-1]
243
+
244
+ def _match_value(self, value: str, pattern: str, modifier: str = None):
245
+ """
246
+ Match a pattern against a value based on the modifier.
247
+
248
+ This function applies the specified modifier (e.g., 'contains', 'startswith')
249
+ to match the pattern against the value.
250
+ """
251
+ if modifier == "contains": return pattern in value
252
+ if modifier == "startswith": return value.startswith(pattern)
253
+ if modifier == "endswith": return value.endswith(pattern)
254
+ if modifier == "re": return bool(re.search(pattern, value))
255
+ return value == pattern
256
+
257
+ def _evaluate_condition(self, condition: str, selections: Dict[str, bool]) -> bool:
258
+ """
259
+ Evaluate a condition based on the selections.
260
+
261
+ This function evaluates the logical condition string (e.g., 'selection1 and not selection2')
262
+ using the results of the selection matching.
263
+ """
264
+ if not condition:
265
+ return any(selections.values())
266
+
267
+ condition = condition.lower().strip()
268
+
269
+ def replace_x_of(match):
270
+ count_str = match.group(1)
271
+ prefix = match.group(2)
272
+
273
+ matching_vals = [v for k, v in selections.items() if k.startswith(prefix)]
274
+ if not matching_vals: return "False"
275
+
276
+ if "not" in count_str:
277
+ target = int(count_str.replace("not", "").strip())
278
+ return str(not (sum(matching_vals) >= target))
279
+ elif "all" in count_str:
280
+ return str(all(matching_vals))
281
+ else:
282
+ target = int(count_str)
283
+ return str(sum(matching_vals) >= target)
284
+
285
+ condition = re.sub(r'((?:not\s+)?\d+|all)\s+of\s+(\w+)\*?', replace_x_of, condition)
286
+
287
+ if "all of them" in condition: condition = condition.replace("all of them", str(all(selections.values())))
288
+ if "1 of them" in condition: condition = condition.replace("1 of them", str(any(selections.values())))
289
+ if "any of them" in condition: condition = condition.replace("any of them", str(any(selections.values())))
290
+
291
+ for key, result in selections.items():
292
+ condition = re.sub(rf"\b{re.escape(key)}\b", str(result), condition)
293
+
294
+ try:
295
+ return bool(eval(condition))
296
+ except Exception:
297
+ return any(selections.values())
298
+
299
+
300
+ class SigmaRulesLoader:
301
+ """
302
+ Manages the lifecycle of Sigma rules within a specified directory.
303
+
304
+ This class handles searching for, loading, and initializing Sigma rules into
305
+ executable matchers. It provides a high-level interface for checking log
306
+ entries against the entire rule set and managing rule-specific metadata.
307
+ """
308
+ def __init__(self, rules_dir: str, flexible_mode: bool = True):
309
+ self.rules_dir = rules_dir
310
+ self.flexible_mode = flexible_mode
311
+ self.matchers = []
312
+ self._load_rules()
313
+
314
+ def _load_rules(self):
315
+ """
316
+ Loads all rules from the specified directory.
317
+
318
+ This function scans the specified rules directory for YAML files,
319
+ creates a SigmaMatcher for each, and stores them in the matchers list.
320
+ """
321
+ if not self.rules_dir:
322
+ print("No rules directory specified. Skipping rule loading.")
323
+ return
324
+
325
+ rules_path = Path(self.rules_dir)
326
+ if not rules_path.exists():
327
+ print(f"Rules directory {rules_path} does not exist")
328
+ return
329
+
330
+ mode_str = "FLEXIBLE" if self.flexible_mode else "STRICT"
331
+ print(f"- Loading Sigma Rules from: {self.rules_dir} (Mode: {mode_str})")
332
+
333
+ loaded_count = 0
334
+ for rule_file in rules_path.glob('**/*.yml'):
335
+ try:
336
+ matcher = SigmaMatcher(str(rule_file), flexible_mode=self.flexible_mode)
337
+ self.matchers.append({
338
+ 'matcher': matcher,
339
+ 'title': matcher.title,
340
+ 'level': matcher.level,
341
+ })
342
+ loaded_count += 1
343
+ except Exception:
344
+ pass
345
+
346
+ print(f"- Total rules loaded: {loaded_count} rules")
347
+
348
+ def check_row(self, parsed_row: Dict[str, Any]) -> List[Dict[str, str]]:
349
+ """
350
+ Check if a row matches any of the loaded rules.
351
+
352
+ This function iterates through all loaded rules and checks if the given
353
+ parsed log row matches any of them. Returns a list of matching rules.
354
+ """
355
+ matches = []
356
+ for rule_info in self.matchers:
357
+ matcher = rule_info['matcher']
358
+ if matcher.match(parsed_row):
359
+ matches.append({
360
+ 'rule_title': matcher.title,
361
+ 'rule_level': matcher.level,
362
+ })
363
+ return matches
364
+
365
+ def extract_sigma_priority(self, sigma_value: str) -> str:
366
+ """
367
+ Select top priority rule based on severity.
368
+
369
+ This function parses a string of matched Sigma rules (formatted as
370
+ 'Title[Severity] | Title[Severity]') and determines the highest priority
371
+ match based on severity level.
372
+ """
373
+ if not sigma_value or not sigma_value.strip():
374
+ return ""
375
+
376
+ items = [s.strip() for s in sigma_value.split("|")]
377
+ priority = {"critical": 5, "high": 4, "medium": 3, "low": 2, "informational": 1}
378
+
379
+ best_item = None
380
+ best_score = 0
381
+
382
+ for item in items:
383
+ if "[" in item and "]" in item:
384
+ severity = item[item.rfind("[")+1 : item.rfind("]")].lower().strip()
385
+ score = priority.get(severity, 0)
386
+ if score > best_score:
387
+ best_score = score
388
+ best_item = item
389
+
390
+ return best_item or ""
391
+
392
+
393
+ class SigmaLabel(object):
394
+ """
395
+ Orchestrates the log labeling process using Sigma rules.
396
+
397
+ This class is responsible for reading input log files (CSV or TXT),
398
+ identifying the appropriate log type and source, and applying the loaded
399
+ Sigma rules to each entry to generate a labeled dataset.
400
+ """
401
+ def __init__(self, input_file, rules_dir=None, flexible_mode=True):
402
+ self.input_file = input_file
403
+ self.rules_dir = rules_dir
404
+ self.flexible_mode = flexible_mode
405
+
406
+ def count_lines(self):
407
+ """
408
+ Counts the number of lines in the input file.
409
+
410
+ This function reads the input file to count the total number of lines,
411
+ which is useful for progress tracking.
412
+ """
413
+ cnt = 0
414
+ try:
415
+ with open(self.input_file, 'r', encoding='utf-8', errors='replace') as f:
416
+ for _ in f: cnt += 1
417
+ except:
418
+ pass
419
+ return cnt
420
+
421
+ def detect_log_type(self, desc: str, filename: str) -> Dict[str, Any]:
422
+ """
423
+ Detects the type of log entry based on its description and filename.
424
+
425
+ This function analyzes the log description and filename to categorize the log
426
+ (e.g., 'webserver', 'linux', 'windows') and extracts relevant fields like
427
+ HTTP methods or status codes.
428
+ """
429
+ parsed = {}
430
+ log_types = []
431
+ lower_desc = desc.lower()
432
+
433
+ if 'access.log' in filename:
434
+ log_types.extend(['webserver', 'proxy', 'nginx', 'apache'])
435
+ self._extract_http_fields(desc, parsed)
436
+
437
+ if 'auth.log' in filename:
438
+ log_types.extend(['linux', 'sshd'])
439
+ if 'pam' in lower_desc: log_types.append('pam')
440
+ if 'syslog' in filename:
441
+ log_types.extend(['syslog', 'linux'])
442
+ if 'systemd' in lower_desc: log_types.append('systemd')
443
+ if 'kernel' in lower_desc: log_types.append('kernel')
444
+ if 'audit' in lower_desc: log_types.append('auditd')
445
+
446
+ if 'windows' in filename.lower() or '.evtx' in filename.lower():
447
+ log_types.append('windows')
448
+ if 'sysmon' in filename.lower(): log_types.append('sysmon')
449
+ if 'security' in filename.lower(): log_types.append('security')
450
+ if 'system' in filename.lower(): log_types.append('system')
451
+
452
+ if self._looks_like_http_log(desc):
453
+ if 'webserver' not in log_types: log_types.extend(['webserver', 'generic_http'])
454
+ if 'cs-method' not in parsed: self._extract_http_fields(desc, parsed)
455
+
456
+ if not log_types:
457
+ log_types.append('unknown')
458
+
459
+ parsed['log_type'] = log_types
460
+ return parsed
461
+
462
+ def _looks_like_http_log(self, desc: str) -> bool:
463
+ """
464
+ Detects if a log entry looks like an HTTP log entry.
465
+
466
+ This function uses regular expressions to check for common HTTP log patterns,
467
+ such as HTTP methods, status codes, or user-agent strings.
468
+ """
469
+ http_indicators = [
470
+ r'\b(GET|POST|PUT|DELETE|HEAD|OPTIONS|PATCH)\b',
471
+ r'HTTP/\d\.\d',
472
+ r'\b(200|301|302|400|401|403|404|500)\b',
473
+ r'user[_-]?agent',
474
+ r'referer',
475
+ ]
476
+ for pattern in http_indicators:
477
+ if re.search(pattern, desc, re.IGNORECASE):
478
+ return True
479
+ return False
480
+
481
+ def _extract_http_fields(self, desc: str, parsed: Dict[str, Any]):
482
+ """
483
+ Extracts HTTP fields from a log entry description.
484
+
485
+ This function parses the log description to extract HTTP Method, URI,
486
+ Status Code, and User Agent, populating the 'parsed' dictionary.
487
+ """
488
+ method_match = re.search(r'\b(GET|POST|PUT|DELETE|HEAD|OPTIONS|PATCH)\b', desc)
489
+ if method_match: parsed['cs-method'] = method_match.group(1)
490
+
491
+ uri_match = re.search(r'(GET|POST|PUT|DELETE|HEAD|OPTIONS|PATCH)\s+([^\s]+)\s+HTTP', desc)
492
+ if uri_match:
493
+ parsed['c-uri'] = uri_match.group(2)
494
+ else:
495
+ uri_match = re.search(r'(GET|POST|PUT|DELETE|HEAD|OPTIONS|PATCH)\s+([^\s\"]+)', desc)
496
+ if uri_match: parsed['c-uri'] = uri_match.group(2)
497
+
498
+ status_match = re.search(r'code:\s*(\d{3})', desc)
499
+ if status_match: parsed['sc-status'] = status_match.group(1)
500
+
501
+ ua_match = re.search(r'user_agent:\s*(.+?)(?:\s+\w+:|$)', desc)
502
+ if ua_match: parsed['cs-user-agent'] = ua_match.group(1).strip()
503
+
504
+ def run(self):
505
+ """
506
+ Processes the input file and returns a labeled DataFrame.
507
+
508
+ This function orchestrates the loading of data, detection of log types,
509
+ matching against Sigma rules, and generation of a labeled DataFrame.
510
+ """
511
+ rules_loader = SigmaRulesLoader(self.rules_dir, flexible_mode=self.flexible_mode)
512
+
513
+ if not rules_loader.matchers:
514
+ print("No rules loaded! Continuing without matching...")
515
+
516
+ is_csv = self.input_file.endswith('.csv')
517
+ df = pd.DataFrame()
518
+
519
+ if is_csv:
520
+ try:
521
+ df = pd.read_csv(self.input_file, dtype=str)
522
+ except:
523
+ df = pd.read_csv(self.input_file, header=None, dtype=str)
524
+ df.columns = [f'col_{i}' for i in range(len(df.columns))]
525
+ else:
526
+ try:
527
+ with open(self.input_file, 'r', encoding='utf-8', errors='replace') as f:
528
+ lines = f.readlines()
529
+ df = pd.DataFrame({'description': lines})
530
+ df['filename'] = self.input_file
531
+ except Exception as e:
532
+ print(f"Error reading file: {e}")
533
+ return df
534
+
535
+ processed_rows = []
536
+ total_rows = len(df)
537
+ print(f"- Labeling {total_rows} rows...")
538
+
539
+ count = 0
540
+ for _, row in df.iterrows():
541
+ count += 1
542
+ if count % 1000 == 0:
543
+ print(f"- Processed {count}/{total_rows} lines...")
544
+
545
+ desc = ""
546
+ if 'message' in row: desc = str(row['message'])
547
+ elif 'desc' in row: desc = str(row['desc'])
548
+ elif 'description' in row: desc = str(row['description'])
549
+ elif len(row) > 4 and isinstance(row.values[4], str): desc = row.values[4]
550
+ else: desc = str(row.values[0])
551
+
552
+ fname = self.input_file
553
+ if 'filename' in row: fname = str(row['filename'])
554
+ elif 'source_short' in row: fname = str(row['source_short'])
555
+ elif 'display_name' in row: fname = str(row['display_name'])
556
+ elif 'source' in row: fname = str(row['source'])
557
+ elif len(row) > 6 and isinstance(row.values[6], str): fname = row.values[6]
558
+
559
+ features = self.detect_log_type(str(desc), str(fname))
560
+
561
+ log_entry = {
562
+ "desc": desc,
563
+ "log_type": features["log_type"],
564
+ "cs-method": features.get("cs-method", ""),
565
+ "c-uri": features.get("c-uri", ""),
566
+ "sc-status": features.get("sc-status", ""),
567
+ "cs-user-agent": features.get("cs-user-agent", ""),
568
+ "service": features.get("service", ""),
569
+ }
570
+
571
+ matches = rules_loader.check_row(log_entry)
572
+
573
+ if matches:
574
+ detection_str = " | ".join([f"{m['rule_title']}[{m['rule_level']}]" for m in matches])
575
+ else:
576
+ detection_str = ""
577
+
578
+ new_row = row.to_dict()
579
+ new_row['logsource'] = str(features['log_type'])
580
+ new_row['sigma'] = rules_loader.extract_sigma_priority(detection_str)
581
+ processed_rows.append(new_row)
582
+
583
+ return pd.DataFrame(processed_rows)
584
+
585
+
586
+ class EdgeGraph(object):
587
+ """
588
+ Constructs a directed graph from sigma-labeled logs to visualize system behavior.
589
+
590
+ This class transforms a sequential list of security events into a
591
+ MultiDiGraph where nodes represent unique event types and edges represent
592
+ temporal transitions between them. It captures event frequency and
593
+ associated log metadata to facilitate forensic analysis.
594
+ """
595
+ def __init__(self, df: pd.DataFrame):
596
+ self.df = df.copy()
597
+
598
+ if 'message' not in self.df.columns:
599
+ if 'desc' in self.df.columns:
600
+ self.df.rename(columns={'desc': 'message'}, inplace=True)
601
+ elif 'description' in self.df.columns:
602
+ self.df.rename(columns={'description': 'message'}, inplace=True)
603
+ else:
604
+ self.df['message'] = ""
605
+
606
+ if 'datetime' not in self.df.columns:
607
+ if 'timestamp' in self.df.columns:
608
+ self.df.rename(columns={'timestamp': 'datetime'}, inplace=True)
609
+ else:
610
+ self.df['datetime'] = ""
611
+
612
+ self.events_dict = {}
613
+ self.node_labels = {}
614
+ self.node_events = []
615
+ self.G = nx.MultiDiGraph()
616
+
617
+ self.log_event_id = []
618
+ self.node_members = defaultdict(list)
619
+ self.event_logs = defaultdict(list)
620
+ self.event_timestamps = defaultdict(list)
621
+
622
+ self.edges_list = []
623
+ self.edges_weight = defaultdict(int)
624
+ self.edges_weight_list = []
625
+
626
+ def define_events(self):
627
+ """
628
+ Identify unique security events from the labeled dataset.
629
+
630
+ This function iterates through the 'sigma' column to find all unique rule matches.
631
+ These matches define the nodes of the graph. Each unique Sigma label
632
+ becomes a distinct node in the resulting behavioral map.
633
+ """
634
+ lines = self.df['message'].tolist()
635
+
636
+ if 'sigma' in self.df.columns:
637
+ events = self.df[self.df['sigma'].notna() & (self.df['sigma'] != '')]['sigma'].unique().tolist()
638
+ else:
639
+ events = []
640
+
641
+ self.events_dict = {}
642
+ for index, event in enumerate(events):
643
+ self.events_dict[event] = index
644
+ self.node_labels = {}
645
+ for index, event in enumerate(events):
646
+ self.node_labels[index] = event
647
+
648
+ self.node_events = []
649
+ for index, event in enumerate(events):
650
+ self.node_events.append((index, {'event': f"{str(index)}. {event}"}))
651
+
652
+ def create_graph(self):
653
+ """
654
+ Initialize the graph with nodes.
655
+
656
+ This function creates a new networkx MultiDiGraph and adds the identified events
657
+ as nodes.
658
+ """
659
+ self.G = nx.MultiDiGraph()
660
+ self.G.add_nodes_from(self.node_events)
661
+ print(f"Graph nodes added: {self.G.number_of_nodes()}")
662
+
663
+ def get_list_event_id(self):
664
+ """
665
+ Map log entries to event IDs.
666
+
667
+ This function processes the DataFrame rows, identifying which event ID corresponds
668
+ to each log entry based on its Sigma label, and stores this mapping.
669
+ """
670
+ self.log_event_id = []
671
+ self.node_members = defaultdict(list)
672
+ self.event_logs = defaultdict(list)
673
+ self.event_timestamps = defaultdict(list)
674
+
675
+ for line_id, row in self.df.iterrows():
676
+ sigma_value = row.get('sigma')
677
+ desc_value = row.get('message')
678
+ timestamp_value = row.get('datetime')
679
+
680
+ if pd.notna(sigma_value) and sigma_value != '':
681
+ if sigma_value in self.events_dict:
682
+ event_id = self.events_dict[sigma_value]
683
+ self.log_event_id.append(event_id)
684
+ self.node_members[event_id].append(line_id)
685
+ self.event_logs[event_id].append(desc_value)
686
+ self.event_timestamps[event_id].append(timestamp_value)
687
+
688
+ def add_node_attributes(self):
689
+ """
690
+ Enrich nodes with attributes.
691
+
692
+ This function adds metadata to each node in the graph, such as the first log snippet,
693
+ timestamp, and the count of logs associated with that event.
694
+ """
695
+ for event_id in self.event_logs.keys():
696
+ logs = self.event_logs[event_id]
697
+ timestamps = self.event_timestamps[event_id]
698
+
699
+
700
+ if logs:
701
+ first_log = logs[0]
702
+ else:
703
+ first_log = ""
704
+
705
+ if timestamps:
706
+ first_timestamp = timestamps[0]
707
+ else:
708
+ first_timestamp = ""
709
+
710
+
711
+ if self.G.has_node(event_id):
712
+ self.G.nodes[event_id]['message'] = first_log
713
+ self.G.nodes[event_id]['timestamp'] = first_timestamp
714
+ self.G.nodes[event_id]['log_count'] = len(logs)
715
+
716
+ def create_edges(self):
717
+ """
718
+ Calculate edges based on event transitions.
719
+
720
+ This function iterates through the sequence of event IDs and creates edges
721
+ between consecutive events, counting their occurrences to determine weights.
722
+ """
723
+ self.edges_list = []
724
+ self.edges_weight = defaultdict(int)
725
+ log_event_id_len = len(self.log_event_id)
726
+
727
+ for index, event_id in enumerate(self.log_event_id):
728
+ if (index + 1) < log_event_id_len:
729
+ self.edges_list.append((event_id, self.log_event_id[index + 1]))
730
+ self.edges_weight[(event_id, self.log_event_id[index + 1])] += 1
731
+
732
+ def create_weighted_edges(self):
733
+ """
734
+ Format edges with weights for the graph.
735
+
736
+ This function prepares the list of weighted edges to be added to the networkx graph.
737
+ """
738
+ self.edges_weight_list = []
739
+ for edge, weight in self.edges_weight.items():
740
+ self.edges_weight_list.append((edge[0], edge[1], {'weight': weight}))
741
+
742
+ def add_edges_to_graph(self):
743
+ """
744
+ Add weighted edges to the graph.
745
+
746
+ This function incorporates the calculated weighted edges into the graph structure.
747
+ """
748
+ self.G.add_edges_from(self.edges_weight_list)
749
+
750
+ def write_to_graphml(self, output_filename="reconstruction_edge_graph.graphml"):
751
+ """
752
+ Save the graph to a GraphML file.
753
+
754
+ This function exports the constructed graph to a file in GraphML format.
755
+ """
756
+ filename_graph_output = output_filename
757
+ nx.write_graphml_lxml(self.G, filename_graph_output)
758
+ print(f"[!] Graph saved to {filename_graph_output}")
759
+ print(f"[!] Graph contains {self.G.number_of_nodes()} nodes and {self.G.number_of_edges()} edges.")
760
+
761
+ def export_event_logs(self, output_filename="reconstruction_event_logs.csv"):
762
+ """
763
+ Export detailed event logs to a separate CSV file.
764
+
765
+ This function creates a detailed CSV report containing every log entry that
766
+ contributed to the identified events.
767
+ """
768
+ csv_export_data = []
769
+ for event_id in self.event_logs.keys():
770
+ logs = self.event_logs[event_id]
771
+ timestamps = self.event_timestamps[event_id]
772
+
773
+ for ts, log in zip(timestamps, logs):
774
+ csv_export_data.append({
775
+ 'event_id': event_id,
776
+ 'event_name': self.node_labels[event_id],
777
+ 'timestamp': ts,
778
+ 'log': log
779
+ })
780
+
781
+ if csv_export_data:
782
+ csv_export_df = pd.DataFrame(csv_export_data)
783
+ csv_filename = output_filename
784
+ csv_export_df.to_csv(csv_filename, index=False)
785
+ print(f"[+] Event logs also saved to: {csv_filename}")
786
+ else:
787
+ print("[!] No event logs to export.")
788
+
789
+ def run_all(self, graph_output="reconstruction_edge_graph.graphml", csv_output=None):
790
+ """
791
+ Execute the full graph construction pipeline.
792
+
793
+ This function runs the full graph construction pipeline, which consists of six phases:
794
+ 1. Defining Events
795
+ 2. Creating Graph Nodes
796
+ 3. Processing Log Events
797
+ 4. Adding Node Attributes
798
+ 5. Creating Edges
799
+ 6. Writing Output
800
+ """
801
+ if self.df.empty:
802
+ print("[!] DataFrame is empty. Cannot build graph.")
803
+ return
804
+
805
+ print("[+] Defining Events")
806
+ self.define_events()
807
+
808
+ if not self.events_dict:
809
+ print("[!] No Sigma events found. Graph will be empty.")
810
+ return
811
+
812
+ print("[+] Creating Graph Nodes")
813
+ self.create_graph()
814
+
815
+ print("[+] Processing Log Events")
816
+ self.get_list_event_id()
817
+
818
+ print("[+] Adding Node Attributes")
819
+ self.add_node_attributes()
820
+
821
+ print("[+] Creating Edges")
822
+ self.create_edges()
823
+ self.create_weighted_edges()
824
+ self.add_edges_to_graph()
825
+
826
+ print("[+] Writing Output")
827
+ self.write_to_graphml(graph_output)
828
+
829
+ if csv_output:
830
+ print("[+] Exporting Event Logs")
831
+ self.export_event_logs(csv_output)
832
+
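# --- Usage sketch (editorial addition, not part of the published source) ---
# A minimal way to drive EdgeGraph directly with an already-labeled DataFrame;
# the column values below are invented and the output file names are arbitrary:
#
#   import pandas as pd
#
#   df = pd.DataFrame({
#       "sigma":    ["A", "B", "A"],
#       "message":  ["login failed", "webshell hit", "login failed again"],
#       "datetime": ["t1", "t2", "t3"],
#   })
#   EdgeGraph(df).run_all(graph_output="example_graph.graphml",
#                         csv_output="example_event_logs.csv")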
833
+ class ReconGraph(object):
834
+ """
835
+ Unified facade for the complete forensic reconstruction pipeline.
836
+
837
+ This class serves as the main entry point for the ReconGraph library,
838
+ coordinating the transition from raw logs to labeled data and finally
839
+ to a behavioral graph. It simplifies complex operations into a
840
+ single automated workflow.
841
+ """
842
+ def __init__(self, input_file, rules_dir=None, flexible_mode=True):
843
+ self.input_file = input_file
844
+ self.rules_dir = rules_dir
845
+ self.flexible_mode = flexible_mode
846
+
847
+ def run_all(self, graph_output="reconstruction_edge_graph.graphml",
848
+ csv_output=None, sigma_output=None):
849
+ """
850
+ Executes the full pipeline.
851
+
852
+ This function runs the full pipeline, which consists of three phases:
853
+ 1. Sigma Labeling
854
+ 2. Edge Graph Construction
855
+ 3. Export
856
+ """
857
+ print(f"[+] Starting ReconGraph Pipeline for {self.input_file}")
858
+
859
+ print("[Phase 1] Sigma Labeling")
860
+ labeler = SigmaLabel(self.input_file, self.rules_dir, flexible_mode=self.flexible_mode)
861
+ df_labeled = labeler.run()
862
+
863
+ if sigma_output:
864
+ if sigma_output == 'AUTO':
865
+ base_name = os.path.splitext(os.path.basename(self.input_file))[0]
866
+ final_sigma_output = f"{base_name}_sigma_labeled.csv"
867
+ else:
868
+ final_sigma_output = sigma_output
869
+
870
+ df_labeled.to_csv(final_sigma_output, index=False)
871
+ print(f"Sigma-labeled data exported to: {final_sigma_output}")
872
+
873
+ print("\n[Phase 2] Edge Graph Construction")
874
+ reconstruction = EdgeGraph(df_labeled)
875
+ reconstruction.run_all(graph_output=graph_output, csv_output=csv_output)
876
+ print("\n[✓] Pipeline Completed Successfully")
877
+
878
+
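# --- Usage sketch (editorial addition, not part of the published source) ---
# Programmatic use of the ReconGraph facade, assuming a timeline CSV and a
# directory of Sigma YAML rules (both paths are placeholders):
#
#   pipeline = ReconGraph("timeline.csv", rules_dir="rules/", flexible_mode=True)
#   pipeline.run_all(graph_output="case01_graph.graphml",
#                    csv_output="case01_event_logs.csv",
#                    sigma_output="AUTO")
#
# Passing sigma_output="AUTO" mirrors the CLI behaviour below and derives the
# labeled-CSV name from the input file name.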
879
+ def main():
880
+ """
881
+ Main execution entry point.
882
+ Uses the ReconGraph facade to run the full pipeline.
883
+ """
884
+ parser = argparse.ArgumentParser(description='Reconstruct a graph from forensic timeline.')
885
+ parser.add_argument('-f', '--file', required=True, help='Path to the input file (CSV or TXT)')
886
+ parser.add_argument('-o', '--output', help='Output filename for the GraphML file', default='reconstruction_edge_graph.graphml')
887
+ parser.add_argument('-r', '--rules', help='Path to the rules directory', default=None)
888
+ parser.add_argument('--export-csv', nargs='?', const='reconstruction_event_logs.csv', default=None, help='Export detailed event logs to a separate CSV file')
889
+ parser.add_argument('--export-sigma', nargs='?', const='AUTO', default=None, help='Export the sigma-labeled DataFrame to a CSV file')
890
+ parser.add_argument('--strict', action='store_true', help='Disable flexible matching mode (strict validation)')
891
+
892
+ args = parser.parse_args()
893
+
894
+ if os.path.exists(args.file):
895
+ pipeline = ReconGraph(
896
+ input_file=args.file,
897
+ rules_dir=args.rules,
898
+ flexible_mode=not args.strict
899
+ )
900
+
901
+ pipeline.run_all(
902
+ graph_output=args.output,
903
+ csv_output=args.export_csv,
904
+ sigma_output=args.export_sigma
905
+ )
906
+
907
+ else:
908
+ print(f"[!] File {args.file} not found. Please ensure the input file is present.")
909
+
910
+ if __name__ == '__main__':
911
+ main()
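# --- CLI usage sketch (editorial addition, not part of the published source) ---
# Example invocations derived from the argparse flags defined in main(); the
# file and directory names are placeholders:
#
#   python recongraph.py -f timeline.csv
#   python recongraph.py -f timeline.csv -o case01_graph.graphml -r ./sigma_rules
#   python recongraph.py -f timeline.csv --export-csv --export-sigma --strict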